diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..f5ffdfde7de0f88935abe61e4c65e06681b8edc4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-579/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-618/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-772/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-822/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..804657de33fbff0b7c340fecdc1a207d1298802c --- /dev/null +++ b/README.md @@ -0,0 +1,178 @@ +--- +library_name: peft +license: mit +base_model: THUDM/GLM-4-32B-0414 +tags: +- axolotl +- generated_from_trainer +datasets: +- anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 +model-index: +- name: magnum-v5-sft-prototype-glm4-32b-lora + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.8.0` +```yaml +base_model: THUDM/GLM-4-32B-0414 +#base_model_ignore_patterns: "*/*" +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +hub_model_id: anthracite-core/magnum-v5-sft-prototype-glm4-32b-lora +hub_strategy: "all_checkpoints" +push_dataset_to_hub: +hf_use_auth_token: true + + +load_in_8bit: false +load_in_4bit: false +strict: false + +datasets: + - path: anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 + ds_type: parquet + type: +shuffle_merged_datasets: true +dataset_prepared_path: ./data/magnum-32b-data +val_set_size: 0.01 +output_dir: ./data/32b-lora-out + +plugins: + - axolotl.integrations.liger.LigerPlugin + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +#liger_rope: false +liger_rms_norm: true +liger_layer_norm: true +liger_glu_activation: true +liger_fused_linear_cross_entropy: true +cut_cross_entropy: true + +sequence_len: 32768 +sample_packing: true +eval_sample_packing: true +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 128 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: +peft_use_rslora: true +lora_modules_to_save: + - embed_tokens + - lm_head + +wandb_project: 32b-magnum-lora +wandb_entity: +wandb_watch: +wandb_name: run4-Lora-0.001-clip +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 2 +optimizer: paged_ademamix_8bit +lr_scheduler: cosine +learning_rate: 2e-4 +max_grad_norm: 0.001 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +s2_attention: + +warmup_steps: 40 +evals_per_epoch: 4 +eval_table_size: +eval_max_new_tokens: +saves_per_epoch: 2 +debug: +deepspeed: 
./deepspeed_configs/zero3_bf16.json +weight_decay: 0.01 +fsdp: +fsdp_config: +special_tokens: + +``` + +

+ +# magnum-v5-sft-prototype-glm4-32b-lora + +This model is a fine-tuned version of [THUDM/GLM-4-32B-0414](https://huggingface.co/THUDM/GLM-4-32B-0414) on the anthracite-core/magnum-v5-sft-proto-glm4-instruct-rev1 dataset. +It achieves the following results on the evaluation set: +- Loss: 1.1075 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 1 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 16 +- total_eval_batch_size: 8 +- optimizer: Use paged_ademamix_8bit and the args are: +No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 40 +- num_epochs: 2.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 1.3541 | 0.0024 | 1 | 1.3336 | +| 1.1718 | 0.2503 | 103 | 1.1633 | +| 1.1976 | 0.5006 | 206 | 1.1460 | +| 1.095 | 0.7509 | 309 | 1.1339 | +| 1.1076 | 1.0 | 412 | 1.1213 | +| 1.1063 | 1.2503 | 515 | 1.1128 | +| 1.1214 | 1.5006 | 618 | 1.1089 | +| 1.0286 | 1.7509 | 721 | 1.1075 | + + +### Framework versions + +- PEFT 0.15.1 +- Transformers 4.51.3 +- Pytorch 2.6.0+cu124 +- Datasets 3.5.0 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + 
"inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bfc934021ae2f94535e9442dcecf9427f7b12c1 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9dabe0dcb2a00ba6eca0b1e4fb714d3c1d5289929ed928c9ab44c923fdb4073 +size 5579575888 diff --git a/checkpoint-579/README.md b/checkpoint-579/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-579/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + 
+ +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-579/adapter_config.json b/checkpoint-579/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d9d8fa4860138947c736b05e4c3dd010601e2671 --- /dev/null +++ b/checkpoint-579/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "k_proj", + "gate_up_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + 
"trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-579/adapter_model.safetensors b/checkpoint-579/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6eeb8d196189660cb9e111e74276cb5032cc3611 --- /dev/null +++ b/checkpoint-579/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80565d96a79b139f4b14a3ced21fa8604075474e378f633f8d127b0c555c29e8 +size 5579575888 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78d06f253477859cecff8f985f19dab627cf4e72 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e8ab9ba739bd62cd3ef94481774d288af3ba11a4b5a56079f2ab51f45db23b +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1174a5d7163f76db7590b4ad4d3e2b6ed441625 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547540e83d082664d31e6df0aae97c6a282b14a2f5c740a5fb4fa414ec682262 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6520e8da61049060f11bde0d4305111cda4afe6e --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:4999177f18b552e941e0cd332fda8f7680c431b4790b6841cb460c113d6343f7 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a11f5eec69a321387bac2889a5f2c303588494 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280b2a747c884ad982d19970113323be477c314aba0791f90aff38e18ee9c5a1 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe929389ab3d7f46caba34cfb9e5168aeb9bd8f2 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e77d311a3f54d90a3341f461ae15d6a06d107718bcdb6f4932396687a882936 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c503c3559a028a2a489f974499d97220cce7a187 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028d1496caa92ddefb57857eb12c276592f4649d823f2a55734edc2c40a91f73 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77c5eb699cb56108c59110e041fb60f868b51736 
--- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:841382c61e7f5ae2f61b61dbc14d0a699c00618877854f2b3ee8779d148e0012 +size 2458601314 diff --git a/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0e411fb9dedd864b0b6669b8c5d93f6a5a67142 --- /dev/null +++ b/checkpoint-579/global_step579/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cd5dcdfa379dcab50f833b3906b633af94d611228e675cc7f086a04d8b32329 +size 2458601314 diff --git a/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad518f276b9deb5d3acc0b20e8892bad4130da9 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a57e502191a0500b9d1c0f16cbf9b0199741c167c4ce17e8f2b6d4593311aa0 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88bffa1ee37846a276cd537819c8f28c8f17b8b1 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1770f917d929eaba8b68921e9bc477fdaf63abe1dfd431ff350a8abc92d1e66 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..020fa356f6db7120e3491832e4c46c9fce909417 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a2eef29639c2c972919eed193d71d81f7be500743aa508a20fbe7b84849d11 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cfd5b7c09c4ab38527cc09df22d8de40ee0d7ef --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac3c8d1e4ce2007a511263e7a05b661ae353592545b19274f21da30669e3cad +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd508f247f80c27b081f6cad8fdc0daedd168450 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4d97f6979f8f733d796a5c58e564596ab2456e618e44a25cf9ace722c64a73 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57995b565610be7090e12ba1a209b58f121e5c66 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a5a4c0a626c1fe61e7a4db142b78cba919bf3f484a5c494933437f0d774cc1 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt 
b/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e34ff54956729aef29800db4956cc690764d01e --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6aeec15becc8fc42faa3f0575c51ef364181d2665459eddc8d2dc966b403020 +size 747668 diff --git a/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eada1b68299ed58e210d0b97eb369e27e8da8881 --- /dev/null +++ b/checkpoint-579/global_step579/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705f93be649ce8d35b21383ed30847f7ffe67a8b1c37e6b0ba22ede8acda0310 +size 747668 diff --git a/checkpoint-579/latest b/checkpoint-579/latest new file mode 100644 index 0000000000000000000000000000000000000000..ca469d91f6dc5559091863658d35124fe4c6a737 --- /dev/null +++ b/checkpoint-579/latest @@ -0,0 +1 @@ +global_step579 \ No newline at end of file diff --git a/checkpoint-579/rng_state_0.pth b/checkpoint-579/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..78ea69bf77bc9a540866bb9542e61b9deec3a3fe --- /dev/null +++ b/checkpoint-579/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c04bc7b73c7a7d50ea429c6932b3c02ee97bad0a60bc0571bce7889d378963 +size 15984 diff --git a/checkpoint-579/rng_state_1.pth b/checkpoint-579/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..08b3f9fd9da2159be3018e4f8c90ebc74fb5a928 --- /dev/null +++ b/checkpoint-579/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31217f02580556ec125eca275e53614326586d6949699267269156e796c66602 +size 15984 diff --git 
a/checkpoint-579/rng_state_2.pth b/checkpoint-579/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac55da7b4f89118e261157974e4446dcd655378e --- /dev/null +++ b/checkpoint-579/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05785e9c45b375c31236f1e23d47f249ad4638fee36b5efedd9a8689e393677 +size 15984 diff --git a/checkpoint-579/rng_state_3.pth b/checkpoint-579/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..29c693b5a30633dc93355e02804a1084b69ee8f6 --- /dev/null +++ b/checkpoint-579/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a44490d89fcd7a0d4c86bd5dba58d6d5df0722673a204f059fdccba9d833240 +size 15984 diff --git a/checkpoint-579/rng_state_4.pth b/checkpoint-579/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..16769eb8e1f615c0823adb92b1402f1dcfab4b79 --- /dev/null +++ b/checkpoint-579/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458fcb8dbc7e1b3dd98288e9a49357b31abb705d49b87c597778a99a064a3349 +size 15984 diff --git a/checkpoint-579/rng_state_5.pth b/checkpoint-579/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..e52e4a4f125a359879f3db83a2fd95f8a6163cd0 --- /dev/null +++ b/checkpoint-579/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0b12c61dd87f425696599697c6a6601f719556cb7f3bd2ca6f1cd6bbb836e5 +size 15984 diff --git a/checkpoint-579/rng_state_6.pth b/checkpoint-579/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e9d62e251af62e249950da700f94e7c125e05d6 --- /dev/null +++ b/checkpoint-579/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb73b7f37f6a6e5ff92e836afc1a8be3a31ef4cafb9feae0870d034fa0e871e +size 15984 diff --git a/checkpoint-579/rng_state_7.pth 
b/checkpoint-579/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b5ec5c88c370f2afdf7d9f98c9c231de26d6f7e --- /dev/null +++ b/checkpoint-579/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b10ca9cc6239fd14cf357acb50f08aecddecb3971e9cd2e8dd3be177dae75b +size 15984 diff --git a/checkpoint-579/scheduler.pt b/checkpoint-579/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..971728ecba22ed809a021e2ed07e4fe42fc08910 --- /dev/null +++ b/checkpoint-579/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6cbd64ca821dc70aa12c5bad69ea8779cfa628a03651d12c29acb5503462db +size 1064 diff --git a/checkpoint-579/special_tokens_map.json b/checkpoint-579/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-579/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-579/tokenizer.json b/checkpoint-579/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-579/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-579/tokenizer_config.json 
b/checkpoint-579/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-579/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git 
a/checkpoint-579/trainer_state.json b/checkpoint-579/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6df763ccb52fd28c798f148e5b42cdeb7a0dd3d6 --- /dev/null +++ b/checkpoint-579/trainer_state.json @@ -0,0 +1,4135 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5, + "eval_steps": 97, + "global_step": 579, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025906735751295338, + "grad_norm": 758.2562349755826, + "learning_rate": 0.0, + "loss": 1.3719, + "step": 1 + }, + { + "epoch": 0.0025906735751295338, + "eval_loss": 1.3159157037734985, + "eval_runtime": 36.907, + "eval_samples_per_second": 20.159, + "eval_steps_per_second": 1.273, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 666.308184823038, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.36, + "step": 2 + }, + { + "epoch": 0.007772020725388601, + "grad_norm": 211.0771195353068, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.3746, + "step": 3 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 431.5114709683218, + "learning_rate": 3e-06, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.012953367875647668, + "grad_norm": 230.87468433791625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3837, + "step": 5 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 635.1636587738542, + "learning_rate": 5e-06, + "loss": 1.3761, + "step": 6 + }, + { + "epoch": 0.018134715025906734, + "grad_norm": 791.5536958334704, + "learning_rate": 6e-06, + "loss": 1.2855, + "step": 7 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 667.7197994216477, + "learning_rate": 7e-06, + "loss": 1.3267, + "step": 8 + }, + { + "epoch": 0.023316062176165803, + "grad_norm": 254.3855973692125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2977, + "step": 9 + }, + { + "epoch": 0.025906735751295335, + 
"grad_norm": 162.29347257682093, + "learning_rate": 9e-06, + "loss": 1.3522, + "step": 10 + }, + { + "epoch": 0.02849740932642487, + "grad_norm": 352.6352930651456, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 11 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 148.2629265526552, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3342, + "step": 12 + }, + { + "epoch": 0.03367875647668394, + "grad_norm": 249.88753789723657, + "learning_rate": 1.2e-05, + "loss": 1.2983, + "step": 13 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 184.03358422636597, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3291, + "step": 14 + }, + { + "epoch": 0.038860103626943004, + "grad_norm": 198.4491469860763, + "learning_rate": 1.4e-05, + "loss": 1.4014, + "step": 15 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 680.9537058769038, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3775, + "step": 16 + }, + { + "epoch": 0.04404145077720207, + "grad_norm": 563.0247638614801, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3228, + "step": 17 + }, + { + "epoch": 0.046632124352331605, + "grad_norm": 271.985463813746, + "learning_rate": 1.7e-05, + "loss": 1.3695, + "step": 18 + }, + { + "epoch": 0.04922279792746114, + "grad_norm": 399.51218452223316, + "learning_rate": 1.8e-05, + "loss": 1.2556, + "step": 19 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 160.70697055826656, + "learning_rate": 1.9e-05, + "loss": 1.2982, + "step": 20 + }, + { + "epoch": 0.054404145077720206, + "grad_norm": 227.8927504687491, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 21 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 550.1538868076032, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2603, + "step": 22 + }, + { + "epoch": 0.05958549222797927, + "grad_norm": 291.8994359919024, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3663, + "step": 23 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 
120.60677833129643, + "learning_rate": 2.3e-05, + "loss": 1.3129, + "step": 24 + }, + { + "epoch": 0.06476683937823834, + "grad_norm": 414.4006662101242, + "learning_rate": 2.4e-05, + "loss": 1.3037, + "step": 25 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 141.48324465317884, + "learning_rate": 2.5e-05, + "loss": 1.3095, + "step": 26 + }, + { + "epoch": 0.06994818652849741, + "grad_norm": 147.86066819937994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2372, + "step": 27 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 214.47337614964576, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3384, + "step": 28 + }, + { + "epoch": 0.07512953367875648, + "grad_norm": 898.4324889241673, + "learning_rate": 2.8e-05, + "loss": 1.2003, + "step": 29 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 128.83026557596128, + "learning_rate": 2.9e-05, + "loss": 1.2172, + "step": 30 + }, + { + "epoch": 0.08031088082901554, + "grad_norm": 183.0777862405529, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.2674, + "step": 31 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 119.01841833358732, + "learning_rate": 3.1e-05, + "loss": 1.2554, + "step": 32 + }, + { + "epoch": 0.08549222797927461, + "grad_norm": 117.65980267542858, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2716, + "step": 33 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 82.40151099433953, + "learning_rate": 3.3e-05, + "loss": 1.2019, + "step": 34 + }, + { + "epoch": 0.09067357512953368, + "grad_norm": 82.61816783653785, + "learning_rate": 3.4e-05, + "loss": 1.2424, + "step": 35 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 136.42743433868276, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.2066, + "step": 36 + }, + { + "epoch": 0.09585492227979274, + "grad_norm": 36.775911657584444, + "learning_rate": 3.6e-05, + "loss": 1.2485, + "step": 37 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 56.55022603284064, + "learning_rate": 
3.7000000000000005e-05, + "loss": 1.2112, + "step": 38 + }, + { + "epoch": 0.10103626943005181, + "grad_norm": 50.09896932886107, + "learning_rate": 3.8e-05, + "loss": 1.2027, + "step": 39 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 54.2661481198025, + "learning_rate": 3.9e-05, + "loss": 1.2673, + "step": 40 + }, + { + "epoch": 0.10621761658031088, + "grad_norm": 60.04145981731815, + "learning_rate": 4e-05, + "loss": 1.1648, + "step": 41 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 169.47741055545822, + "learning_rate": 3.999981580539036e-05, + "loss": 1.2393, + "step": 42 + }, + { + "epoch": 0.11139896373056994, + "grad_norm": 43.64716987307323, + "learning_rate": 3.9999263224954204e-05, + "loss": 1.2906, + "step": 43 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 51.3206609767585, + "learning_rate": 3.999834226886976e-05, + "loss": 1.1807, + "step": 44 + }, + { + "epoch": 0.11658031088082901, + "grad_norm": 38.95055887413869, + "learning_rate": 3.999705295410054e-05, + "loss": 1.1825, + "step": 45 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 40.59968974426338, + "learning_rate": 3.999539530439504e-05, + "loss": 1.193, + "step": 46 + }, + { + "epoch": 0.12176165803108809, + "grad_norm": 34.5796571445333, + "learning_rate": 3.9993369350286265e-05, + "loss": 1.2127, + "step": 47 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.97693356149241, + "learning_rate": 3.99909751290912e-05, + "loss": 1.1543, + "step": 48 + }, + { + "epoch": 0.12694300518134716, + "grad_norm": 82.9217015858092, + "learning_rate": 3.9988212684910107e-05, + "loss": 1.2329, + "step": 49 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 49.256542144400214, + "learning_rate": 3.9985082068625724e-05, + "loss": 1.212, + "step": 50 + }, + { + "epoch": 0.13212435233160622, + "grad_norm": 45.025980435259484, + "learning_rate": 3.998158333790231e-05, + "loss": 1.2129, + "step": 51 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 
45.98465689592428, + "learning_rate": 3.99777165571846e-05, + "loss": 1.1709, + "step": 52 + }, + { + "epoch": 0.13730569948186527, + "grad_norm": 43.481241408477906, + "learning_rate": 3.997348179769661e-05, + "loss": 1.1614, + "step": 53 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 82.17633750834132, + "learning_rate": 3.996887913744033e-05, + "loss": 1.2205, + "step": 54 + }, + { + "epoch": 0.14248704663212436, + "grad_norm": 53.0176514970764, + "learning_rate": 3.9963908661194285e-05, + "loss": 1.1204, + "step": 55 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 67.86382426995611, + "learning_rate": 3.995857046051196e-05, + "loss": 1.1839, + "step": 56 + }, + { + "epoch": 0.14766839378238342, + "grad_norm": 31.282407703790597, + "learning_rate": 3.995286463372013e-05, + "loss": 1.2126, + "step": 57 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 52.200764429265604, + "learning_rate": 3.994679128591706e-05, + "loss": 1.2036, + "step": 58 + }, + { + "epoch": 0.15284974093264247, + "grad_norm": 60.706608653531895, + "learning_rate": 3.9940350528970535e-05, + "loss": 1.1848, + "step": 59 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 47.31754062899529, + "learning_rate": 3.993354248151583e-05, + "loss": 1.0869, + "step": 60 + }, + { + "epoch": 0.15803108808290156, + "grad_norm": 49.42450836392811, + "learning_rate": 3.9926367268953514e-05, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 38.791167030088886, + "learning_rate": 3.991882502344712e-05, + "loss": 1.1881, + "step": 62 + }, + { + "epoch": 0.16321243523316062, + "grad_norm": 56.16339499737216, + "learning_rate": 3.991091588392077e-05, + "loss": 1.1518, + "step": 63 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 861.8559063020828, + "learning_rate": 3.990263999605652e-05, + "loss": 1.1614, + "step": 64 + }, + { + "epoch": 0.16839378238341968, + "grad_norm": 50.92822786500888, + "learning_rate": 3.989399751229179e-05, + 
"loss": 1.1998, + "step": 65 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 31.04121324055666, + "learning_rate": 3.988498859181645e-05, + "loss": 1.1795, + "step": 66 + }, + { + "epoch": 0.17357512953367876, + "grad_norm": 50.33061983380845, + "learning_rate": 3.9875613400569975e-05, + "loss": 1.1742, + "step": 67 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 75.20462514003519, + "learning_rate": 3.986587211123833e-05, + "loss": 1.1856, + "step": 68 + }, + { + "epoch": 0.17875647668393782, + "grad_norm": 38.82139317052205, + "learning_rate": 3.98557649032508e-05, + "loss": 1.1529, + "step": 69 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 36.55988806615175, + "learning_rate": 3.984529196277674e-05, + "loss": 1.1884, + "step": 70 + }, + { + "epoch": 0.18393782383419688, + "grad_norm": 104.8931793971097, + "learning_rate": 3.983445348272203e-05, + "loss": 1.2182, + "step": 71 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 36.50395409234617, + "learning_rate": 3.982324966272566e-05, + "loss": 1.1609, + "step": 72 + }, + { + "epoch": 0.18911917098445596, + "grad_norm": 35.019191693448626, + "learning_rate": 3.981168070915594e-05, + "loss": 1.173, + "step": 73 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 33.378390048053596, + "learning_rate": 3.979974683510677e-05, + "loss": 1.173, + "step": 74 + }, + { + "epoch": 0.19430051813471502, + "grad_norm": 43.356840136984154, + "learning_rate": 3.978744826039366e-05, + "loss": 1.2032, + "step": 75 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 31.285725922510768, + "learning_rate": 3.977478521154974e-05, + "loss": 1.1569, + "step": 76 + }, + { + "epoch": 0.19948186528497408, + "grad_norm": 35.19264482867074, + "learning_rate": 3.9761757921821544e-05, + "loss": 1.1365, + "step": 77 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 44.66037256551279, + "learning_rate": 3.974836663116472e-05, + "loss": 1.164, + "step": 78 + }, + { + "epoch": 0.20466321243523317, + 
"grad_norm": 68.91101457952654, + "learning_rate": 3.973461158623963e-05, + "loss": 1.2256, + "step": 79 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 45.866521854583, + "learning_rate": 3.9720493040406786e-05, + "loss": 1.1697, + "step": 80 + }, + { + "epoch": 0.20984455958549222, + "grad_norm": 59.63095169617338, + "learning_rate": 3.970601125372218e-05, + "loss": 1.2094, + "step": 81 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 39.085597271064216, + "learning_rate": 3.9691166492932535e-05, + "loss": 1.1048, + "step": 82 + }, + { + "epoch": 0.21502590673575128, + "grad_norm": 36.40256073477861, + "learning_rate": 3.9675959031470336e-05, + "loss": 1.248, + "step": 83 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 29.846921716586085, + "learning_rate": 3.966038914944881e-05, + "loss": 1.1718, + "step": 84 + }, + { + "epoch": 0.22020725388601037, + "grad_norm": 50.87052190327881, + "learning_rate": 3.964445713365682e-05, + "loss": 1.1529, + "step": 85 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 35.32915760431302, + "learning_rate": 3.9628163277553486e-05, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.22538860103626943, + "grad_norm": 157.5587514654703, + "learning_rate": 3.961150788126286e-05, + "loss": 1.2194, + "step": 87 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 25.03485489120971, + "learning_rate": 3.9594491251568376e-05, + "loss": 1.1392, + "step": 88 + }, + { + "epoch": 0.23056994818652848, + "grad_norm": 80.55933867045263, + "learning_rate": 3.957711370190716e-05, + "loss": 1.1819, + "step": 89 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 272.22874004071406, + "learning_rate": 3.9559375552364325e-05, + "loss": 1.0998, + "step": 90 + }, + { + "epoch": 0.23575129533678757, + "grad_norm": 91.94671663482514, + "learning_rate": 3.954127712966702e-05, + "loss": 1.2494, + "step": 91 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 54.31533598131098, + "learning_rate": 
3.952281876717843e-05, + "loss": 1.1385, + "step": 92 + }, + { + "epoch": 0.24093264248704663, + "grad_norm": 103.20789745908105, + "learning_rate": 3.950400080489165e-05, + "loss": 1.1398, + "step": 93 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 45.14746362545893, + "learning_rate": 3.94848235894234e-05, + "loss": 1.2697, + "step": 94 + }, + { + "epoch": 0.24611398963730569, + "grad_norm": 21.271923336142002, + "learning_rate": 3.9465287474007654e-05, + "loss": 1.1397, + "step": 95 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 93.89786795431422, + "learning_rate": 3.944539281848912e-05, + "loss": 1.1542, + "step": 96 + }, + { + "epoch": 0.25129533678756477, + "grad_norm": 32.38768349342839, + "learning_rate": 3.942513998931663e-05, + "loss": 1.1693, + "step": 97 + }, + { + "epoch": 0.25129533678756477, + "eval_loss": 1.1344976425170898, + "eval_runtime": 37.8807, + "eval_samples_per_second": 19.641, + "eval_steps_per_second": 1.241, + "step": 97 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 91.41293468177638, + "learning_rate": 3.940452935953639e-05, + "loss": 1.1724, + "step": 98 + }, + { + "epoch": 0.25647668393782386, + "grad_norm": 39.20645478419229, + "learning_rate": 3.9383561308785075e-05, + "loss": 1.1583, + "step": 99 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 35.32804513153546, + "learning_rate": 3.9362236223282885e-05, + "loss": 1.158, + "step": 100 + }, + { + "epoch": 0.2616580310880829, + "grad_norm": 35.24783762804842, + "learning_rate": 3.934055449582641e-05, + "loss": 1.1552, + "step": 101 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 33.743808031979775, + "learning_rate": 3.931851652578137e-05, + "loss": 1.264, + "step": 102 + }, + { + "epoch": 0.266839378238342, + "grad_norm": 113.49798793226394, + "learning_rate": 3.92961227190753e-05, + "loss": 1.2361, + "step": 103 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 31.813807349410364, + "learning_rate": 3.9273373488190036e-05, + "loss": 
1.1246, + "step": 104 + }, + { + "epoch": 0.27202072538860106, + "grad_norm": 29.391695486306187, + "learning_rate": 3.925026925215417e-05, + "loss": 1.1142, + "step": 105 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 33.79933331839905, + "learning_rate": 3.922681043653526e-05, + "loss": 1.1401, + "step": 106 + }, + { + "epoch": 0.2772020725388601, + "grad_norm": 39.09509012730907, + "learning_rate": 3.920299747343204e-05, + "loss": 1.1822, + "step": 107 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 37.81471938433609, + "learning_rate": 3.9178830801466465e-05, + "loss": 1.1592, + "step": 108 + }, + { + "epoch": 0.2823834196891192, + "grad_norm": 69.07753778460207, + "learning_rate": 3.915431086577561e-05, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 28.864787246081605, + "learning_rate": 3.912943811800347e-05, + "loss": 1.1179, + "step": 110 + }, + { + "epoch": 0.28756476683937826, + "grad_norm": 28.842042951717836, + "learning_rate": 3.910421301629264e-05, + "loss": 1.1317, + "step": 111 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 51.475482074695506, + "learning_rate": 3.9078636025275904e-05, + "loss": 1.1451, + "step": 112 + }, + { + "epoch": 0.2927461139896373, + "grad_norm": 33.48279556713943, + "learning_rate": 3.9052707616067654e-05, + "loss": 1.1554, + "step": 113 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 21.279603575929844, + "learning_rate": 3.9026428266255205e-05, + "loss": 1.1636, + "step": 114 + }, + { + "epoch": 0.2979274611398964, + "grad_norm": 36.226178034876675, + "learning_rate": 3.899979845989003e-05, + "loss": 1.1966, + "step": 115 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 29.90506353145981, + "learning_rate": 3.897281868747878e-05, + "loss": 1.1888, + "step": 116 + }, + { + "epoch": 0.30310880829015546, + "grad_norm": 36.04602777809767, + "learning_rate": 3.894548944597434e-05, + "loss": 1.2066, + "step": 117 + }, + { + "epoch": 
0.30569948186528495, + "grad_norm": 36.42793844948301, + "learning_rate": 3.8917811238766606e-05, + "loss": 1.1712, + "step": 118 + }, + { + "epoch": 0.3082901554404145, + "grad_norm": 58.788967662325696, + "learning_rate": 3.888978457567323e-05, + "loss": 1.1225, + "step": 119 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 29.357299816022326, + "learning_rate": 3.886140997293024e-05, + "loss": 1.1315, + "step": 120 + }, + { + "epoch": 0.3134715025906736, + "grad_norm": 95.08345317107502, + "learning_rate": 3.883268795318252e-05, + "loss": 1.1852, + "step": 121 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 33.6623824593179, + "learning_rate": 3.88036190454742e-05, + "loss": 1.16, + "step": 122 + }, + { + "epoch": 0.31865284974093266, + "grad_norm": 42.587546987131105, + "learning_rate": 3.8774203785238886e-05, + "loss": 1.1374, + "step": 123 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 33.360649853064245, + "learning_rate": 3.8744442714289816e-05, + "loss": 1.1757, + "step": 124 + }, + { + "epoch": 0.3238341968911917, + "grad_norm": 49.09256643961471, + "learning_rate": 3.8714336380809874e-05, + "loss": 1.1782, + "step": 125 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 31.505007051172793, + "learning_rate": 3.86838853393415e-05, + "loss": 1.195, + "step": 126 + }, + { + "epoch": 0.3290155440414508, + "grad_norm": 34.36735417254799, + "learning_rate": 3.865309015077645e-05, + "loss": 1.1078, + "step": 127 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 36.63220606142181, + "learning_rate": 3.862195138234551e-05, + "loss": 1.1319, + "step": 128 + }, + { + "epoch": 0.33419689119170987, + "grad_norm": 53.324986862513676, + "learning_rate": 3.859046960760801e-05, + "loss": 1.2301, + "step": 129 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 47.41445409144979, + "learning_rate": 3.855864540644126e-05, + "loss": 1.2366, + "step": 130 + }, + { + "epoch": 0.3393782383419689, + "grad_norm": 32.57355122427366, + 
"learning_rate": 3.8526479365029906e-05, + "loss": 1.142, + "step": 131 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 28.445824333644715, + "learning_rate": 3.849397207585508e-05, + "loss": 1.0847, + "step": 132 + }, + { + "epoch": 0.344559585492228, + "grad_norm": 49.23062726715889, + "learning_rate": 3.846112413768353e-05, + "loss": 1.2241, + "step": 133 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 53.424206543788074, + "learning_rate": 3.842793615555657e-05, + "loss": 1.2392, + "step": 134 + }, + { + "epoch": 0.34974093264248707, + "grad_norm": 38.19316140175426, + "learning_rate": 3.8394408740778934e-05, + "loss": 1.1208, + "step": 135 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 32.35931252369273, + "learning_rate": 3.836054251090755e-05, + "loss": 1.1604, + "step": 136 + }, + { + "epoch": 0.3549222797927461, + "grad_norm": 37.90085344799495, + "learning_rate": 3.83263380897401e-05, + "loss": 1.1134, + "step": 137 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 44.49191588319939, + "learning_rate": 3.829179610730359e-05, + "loss": 1.1281, + "step": 138 + }, + { + "epoch": 0.3601036269430052, + "grad_norm": 141.98524430756757, + "learning_rate": 3.8256917199842715e-05, + "loss": 1.0928, + "step": 139 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 30.887093976524472, + "learning_rate": 3.822170200980815e-05, + "loss": 1.0936, + "step": 140 + }, + { + "epoch": 0.36528497409326427, + "grad_norm": 21.980521878837745, + "learning_rate": 3.818615118584472e-05, + "loss": 1.1368, + "step": 141 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 538.6650762618656, + "learning_rate": 3.815026538277943e-05, + "loss": 1.0918, + "step": 142 + }, + { + "epoch": 0.3704663212435233, + "grad_norm": 40.842881572203, + "learning_rate": 3.811404526160943e-05, + "loss": 1.1705, + "step": 143 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 26.891553492377298, + "learning_rate": 3.8077491489489835e-05, + "loss": 1.1468, + 
"step": 144 + }, + { + "epoch": 0.3756476683937824, + "grad_norm": 45.138483181178074, + "learning_rate": 3.8040604739721415e-05, + "loss": 1.1679, + "step": 145 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 35.133763086168244, + "learning_rate": 3.8003385691738227e-05, + "loss": 1.1029, + "step": 146 + }, + { + "epoch": 0.38082901554404147, + "grad_norm": 36.941250802707344, + "learning_rate": 3.7965835031095065e-05, + "loss": 1.1491, + "step": 147 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 90.1080256703095, + "learning_rate": 3.792795344945485e-05, + "loss": 1.1212, + "step": 148 + }, + { + "epoch": 0.3860103626943005, + "grad_norm": 39.70360899750413, + "learning_rate": 3.7889741644575914e-05, + "loss": 1.15, + "step": 149 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 28.229369877304094, + "learning_rate": 3.78512003202991e-05, + "loss": 1.1111, + "step": 150 + }, + { + "epoch": 0.3911917098445596, + "grad_norm": 31.611752191925987, + "learning_rate": 3.7812330186534815e-05, + "loss": 1.1366, + "step": 151 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 38.196015586772425, + "learning_rate": 3.777313195924998e-05, + "loss": 1.1433, + "step": 152 + }, + { + "epoch": 0.3963730569948187, + "grad_norm": 22.732638044547453, + "learning_rate": 3.773360636045481e-05, + "loss": 1.1125, + "step": 153 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 90.19158665385014, + "learning_rate": 3.7693754118189525e-05, + "loss": 1.1242, + "step": 154 + }, + { + "epoch": 0.4015544041450777, + "grad_norm": 42.43479974993017, + "learning_rate": 3.765357596651095e-05, + "loss": 1.1191, + "step": 155 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 88.0076735720364, + "learning_rate": 3.761307264547899e-05, + "loss": 1.1718, + "step": 156 + }, + { + "epoch": 0.4067357512953368, + "grad_norm": 30.782507703935767, + "learning_rate": 3.757224490114297e-05, + "loss": 1.109, + "step": 157 + }, + { + "epoch": 0.40932642487046633, + 
"grad_norm": 69.89871106113397, + "learning_rate": 3.7531093485527943e-05, + "loss": 1.1018, + "step": 158 + }, + { + "epoch": 0.4119170984455959, + "grad_norm": 37.339006645717305, + "learning_rate": 3.7489619156620796e-05, + "loss": 1.1358, + "step": 159 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 28.06388054378899, + "learning_rate": 3.744782267835632e-05, + "loss": 1.0847, + "step": 160 + }, + { + "epoch": 0.4170984455958549, + "grad_norm": 54.05874281297702, + "learning_rate": 3.740570482060311e-05, + "loss": 1.1682, + "step": 161 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 32.299093265328835, + "learning_rate": 3.73632663591494e-05, + "loss": 1.1413, + "step": 162 + }, + { + "epoch": 0.422279792746114, + "grad_norm": 31.213652090157694, + "learning_rate": 3.732050807568878e-05, + "loss": 1.1313, + "step": 163 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 40.01090035937505, + "learning_rate": 3.727743075780578e-05, + "loss": 1.1513, + "step": 164 + }, + { + "epoch": 0.4274611398963731, + "grad_norm": 47.11352577964853, + "learning_rate": 3.723403519896136e-05, + "loss": 1.2192, + "step": 165 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 28.645086506093037, + "learning_rate": 3.7190322198478355e-05, + "loss": 1.1097, + "step": 166 + }, + { + "epoch": 0.4326424870466321, + "grad_norm": 35.28541113925116, + "learning_rate": 3.7146292561526654e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 58.30281063037669, + "learning_rate": 3.7101947099108425e-05, + "loss": 1.1829, + "step": 168 + }, + { + "epoch": 0.4378238341968912, + "grad_norm": 26.33563548968379, + "learning_rate": 3.70572866280432e-05, + "loss": 1.147, + "step": 169 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 57.00052875402651, + "learning_rate": 3.701231197095277e-05, + "loss": 1.1212, + "step": 170 + }, + { + "epoch": 0.4430051813471503, + "grad_norm": 23.672828037237174, + "learning_rate": 
3.696702395624608e-05, + "loss": 1.1152, + "step": 171 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 41.1264174112964, + "learning_rate": 3.692142341810395e-05, + "loss": 1.1154, + "step": 172 + }, + { + "epoch": 0.4481865284974093, + "grad_norm": 26.72177706144361, + "learning_rate": 3.6875511196463715e-05, + "loss": 1.1725, + "step": 173 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 95.4088800585977, + "learning_rate": 3.682928813700375e-05, + "loss": 1.1339, + "step": 174 + }, + { + "epoch": 0.4533678756476684, + "grad_norm": 34.33666578349465, + "learning_rate": 3.678275509112788e-05, + "loss": 1.1867, + "step": 175 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 31.032304531003014, + "learning_rate": 3.6735912915949745e-05, + "loss": 1.1386, + "step": 176 + }, + { + "epoch": 0.4585492227979275, + "grad_norm": 55.22043313188224, + "learning_rate": 3.6688762474276945e-05, + "loss": 1.1102, + "step": 177 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 29.82713377876857, + "learning_rate": 3.6641304634595216e-05, + "loss": 1.1564, + "step": 178 + }, + { + "epoch": 0.4637305699481865, + "grad_norm": 35.71025459541737, + "learning_rate": 3.659354027105238e-05, + "loss": 1.0939, + "step": 179 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 52.41175655642653, + "learning_rate": 3.6545470263442265e-05, + "loss": 1.1578, + "step": 180 + }, + { + "epoch": 0.4689119170984456, + "grad_norm": 27.682485766528306, + "learning_rate": 3.649709549718849e-05, + "loss": 1.1875, + "step": 181 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 36.53293663303487, + "learning_rate": 3.6448416863328186e-05, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.4740932642487047, + "grad_norm": 31.45177998538027, + "learning_rate": 3.639943525849555e-05, + "loss": 1.113, + "step": 183 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 28.323097072885673, + "learning_rate": 3.635015158490533e-05, + "loss": 1.1159, + "step": 184 + }, + 
{ + "epoch": 0.4792746113989637, + "grad_norm": 47.75573754341213, + "learning_rate": 3.6300566750336225e-05, + "loss": 1.1305, + "step": 185 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 21.384095061494357, + "learning_rate": 3.625068166811418e-05, + "loss": 1.1369, + "step": 186 + }, + { + "epoch": 0.4844559585492228, + "grad_norm": 30.714645036809546, + "learning_rate": 3.6200497257095504e-05, + "loss": 1.1858, + "step": 187 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 35.12161426399798, + "learning_rate": 3.615001444165001e-05, + "loss": 1.1293, + "step": 188 + }, + { + "epoch": 0.4896373056994819, + "grad_norm": 116.83443661381396, + "learning_rate": 3.6099234151643924e-05, + "loss": 1.1515, + "step": 189 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 55.47885243409044, + "learning_rate": 3.604815732242283e-05, + "loss": 1.112, + "step": 190 + }, + { + "epoch": 0.4948186528497409, + "grad_norm": 32.332747429034285, + "learning_rate": 3.5996784894794394e-05, + "loss": 1.1661, + "step": 191 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 33.039210183180046, + "learning_rate": 3.594511781501103e-05, + "loss": 1.1244, + "step": 192 + }, + { + "epoch": 0.5, + "grad_norm": 21.325687337182504, + "learning_rate": 3.58931570347525e-05, + "loss": 1.1634, + "step": 193 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 51.37599478469561, + "learning_rate": 3.584090351110838e-05, + "loss": 1.2106, + "step": 194 + }, + { + "epoch": 0.5025906735751295, + "eval_loss": 1.1119717359542847, + "eval_runtime": 49.6027, + "eval_samples_per_second": 14.999, + "eval_steps_per_second": 0.948, + "step": 194 + }, + { + "epoch": 0.5051813471502591, + "grad_norm": 42.105169991612456, + "learning_rate": 3.57883582065604e-05, + "loss": 1.1303, + "step": 195 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 37.14457014578168, + "learning_rate": 3.573552208896474e-05, + "loss": 1.1483, + "step": 196 + }, + { + "epoch": 0.5103626943005182, + 
"grad_norm": 28.56241612018119, + "learning_rate": 3.568239613153421e-05, + "loss": 1.0843, + "step": 197 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 35.399304035761865, + "learning_rate": 3.5628981312820315e-05, + "loss": 1.1177, + "step": 198 + }, + { + "epoch": 0.5155440414507773, + "grad_norm": 25.91156850470446, + "learning_rate": 3.557527861669522e-05, + "loss": 1.1215, + "step": 199 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 43.509516777992324, + "learning_rate": 3.552128903233363e-05, + "loss": 1.1532, + "step": 200 + }, + { + "epoch": 0.5207253886010362, + "grad_norm": 38.18164449834795, + "learning_rate": 3.54670135541946e-05, + "loss": 1.1142, + "step": 201 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 48.576743289054534, + "learning_rate": 3.541245318200318e-05, + "loss": 1.1152, + "step": 202 + }, + { + "epoch": 0.5259067357512953, + "grad_norm": 38.65411737007163, + "learning_rate": 3.5357608920732e-05, + "loss": 1.1607, + "step": 203 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 35.663493907396834, + "learning_rate": 3.530248178058282e-05, + "loss": 1.1273, + "step": 204 + }, + { + "epoch": 0.5310880829015544, + "grad_norm": 26.829817821665976, + "learning_rate": 3.5247072776967805e-05, + "loss": 1.1174, + "step": 205 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 39.79604912152638, + "learning_rate": 3.519138293049097e-05, + "loss": 1.1811, + "step": 206 + }, + { + "epoch": 0.5362694300518135, + "grad_norm": 32.26179097390416, + "learning_rate": 3.513541326692925e-05, + "loss": 1.1346, + "step": 207 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 24.35769329902787, + "learning_rate": 3.5079164817213684e-05, + "loss": 1.1061, + "step": 208 + }, + { + "epoch": 0.5414507772020726, + "grad_norm": 26.645546258363844, + "learning_rate": 3.5022638617410396e-05, + "loss": 1.0514, + "step": 209 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 105.19676603444857, + "learning_rate": 
3.496583570870152e-05, + "loss": 1.1474, + "step": 210 + }, + { + "epoch": 0.5466321243523317, + "grad_norm": 61.600623030405885, + "learning_rate": 3.4908757137366006e-05, + "loss": 1.104, + "step": 211 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 31.65460129853052, + "learning_rate": 3.485140395476038e-05, + "loss": 1.0737, + "step": 212 + }, + { + "epoch": 0.5518134715025906, + "grad_norm": 26.860379117211497, + "learning_rate": 3.4793777217299346e-05, + "loss": 1.1119, + "step": 213 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 39.89324262309783, + "learning_rate": 3.473587798643633e-05, + "loss": 1.1626, + "step": 214 + }, + { + "epoch": 0.5569948186528497, + "grad_norm": 39.77638257731599, + "learning_rate": 3.467770732864399e-05, + "loss": 1.1545, + "step": 215 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 30.994657564291458, + "learning_rate": 3.461926631539445e-05, + "loss": 1.1646, + "step": 216 + }, + { + "epoch": 0.5621761658031088, + "grad_norm": 51.99674092516571, + "learning_rate": 3.4560556023139695e-05, + "loss": 1.1638, + "step": 217 + }, + { + "epoch": 0.5647668393782384, + "grad_norm": 58.5132713002146, + "learning_rate": 3.450157753329166e-05, + "loss": 1.1461, + "step": 218 + }, + { + "epoch": 0.5673575129533679, + "grad_norm": 30.712469030418482, + "learning_rate": 3.4442331932202326e-05, + "loss": 1.1583, + "step": 219 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 47.00217426642832, + "learning_rate": 3.438282031114374e-05, + "loss": 1.1154, + "step": 220 + }, + { + "epoch": 0.572538860103627, + "grad_norm": 37.33927961163222, + "learning_rate": 3.432304376628787e-05, + "loss": 1.1372, + "step": 221 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 28.858636933974392, + "learning_rate": 3.4263003398686464e-05, + "loss": 1.0488, + "step": 222 + }, + { + "epoch": 0.5777202072538861, + "grad_norm": 37.842230890171486, + "learning_rate": 3.420270031425072e-05, + "loss": 1.1892, + "step": 223 + }, + { + 
"epoch": 0.5803108808290155, + "grad_norm": 32.65394945357516, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.1218, + "step": 224 + }, + { + "epoch": 0.582901554404145, + "grad_norm": 115.22040829465772, + "learning_rate": 3.4081310442696114e-05, + "loss": 1.1546, + "step": 225 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 31.20514468446119, + "learning_rate": 3.402022589151325e-05, + "loss": 1.0969, + "step": 226 + }, + { + "epoch": 0.5880829015544041, + "grad_norm": 52.8397361926395, + "learning_rate": 3.395888309532687e-05, + "loss": 1.1218, + "step": 227 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 51.7991692917308, + "learning_rate": 3.3897283184038215e-05, + "loss": 1.1395, + "step": 228 + }, + { + "epoch": 0.5932642487046632, + "grad_norm": 33.56775233970504, + "learning_rate": 3.3835427292284445e-05, + "loss": 1.1107, + "step": 229 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 46.081120788214314, + "learning_rate": 3.3773316559417734e-05, + "loss": 1.1472, + "step": 230 + }, + { + "epoch": 0.5984455958549223, + "grad_norm": 41.72558170492288, + "learning_rate": 3.371095212948431e-05, + "loss": 1.1871, + "step": 231 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 34.27957927587091, + "learning_rate": 3.364833515120336e-05, + "loss": 1.1376, + "step": 232 + }, + { + "epoch": 0.6036269430051814, + "grad_norm": 36.58452602010953, + "learning_rate": 3.358546677794586e-05, + "loss": 1.1885, + "step": 233 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 28.010809914189192, + "learning_rate": 3.352234816771337e-05, + "loss": 1.102, + "step": 234 + }, + { + "epoch": 0.6088082901554405, + "grad_norm": 24.78419558611963, + "learning_rate": 3.3458980483116664e-05, + "loss": 1.0818, + "step": 235 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 28.12830040081226, + "learning_rate": 3.3395364891354316e-05, + "loss": 1.1862, + "step": 236 + }, + { + "epoch": 0.6139896373056994, + "grad_norm": 37.94181651161551, + 
"learning_rate": 3.333150256419127e-05, + "loss": 1.147, + "step": 237 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 21.809518482701854, + "learning_rate": 3.3267394677937134e-05, + "loss": 1.0994, + "step": 238 + }, + { + "epoch": 0.6191709844559585, + "grad_norm": 32.12135773753589, + "learning_rate": 3.320304241342464e-05, + "loss": 1.1531, + "step": 239 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 51.959731073524054, + "learning_rate": 3.31384469559878e-05, + "loss": 1.1717, + "step": 240 + }, + { + "epoch": 0.6243523316062176, + "grad_norm": 28.045815836372345, + "learning_rate": 3.307360949544012e-05, + "loss": 1.1814, + "step": 241 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 39.55208384578746, + "learning_rate": 3.300853122605268e-05, + "loss": 1.1483, + "step": 242 + }, + { + "epoch": 0.6295336787564767, + "grad_norm": 29.799974205160808, + "learning_rate": 3.294321334653213e-05, + "loss": 1.1838, + "step": 243 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 124.31035254102245, + "learning_rate": 3.2877657059998584e-05, + "loss": 1.0698, + "step": 244 + }, + { + "epoch": 0.6347150259067358, + "grad_norm": 37.989925180187655, + "learning_rate": 3.281186357396351e-05, + "loss": 1.0984, + "step": 245 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 55.72599333657572, + "learning_rate": 3.274583410030745e-05, + "loss": 1.2333, + "step": 246 + }, + { + "epoch": 0.6398963730569949, + "grad_norm": 46.77079456439719, + "learning_rate": 3.267956985525774e-05, + "loss": 1.2157, + "step": 247 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 33.62329915252562, + "learning_rate": 3.261307205936603e-05, + "loss": 1.1752, + "step": 248 + }, + { + "epoch": 0.6450777202072538, + "grad_norm": 34.11794183225494, + "learning_rate": 3.2546341937485884e-05, + "loss": 1.1265, + "step": 249 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 36.027636323913896, + "learning_rate": 3.247938071875017e-05, + "loss": 1.103, + "step": 
250 + }, + { + "epoch": 0.6502590673575129, + "grad_norm": 35.393219337329946, + "learning_rate": 3.2412189636548456e-05, + "loss": 1.1148, + "step": 251 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 31.578919022569924, + "learning_rate": 3.234476992850425e-05, + "loss": 1.1149, + "step": 252 + }, + { + "epoch": 0.655440414507772, + "grad_norm": 28.93717647736964, + "learning_rate": 3.227712283645224e-05, + "loss": 1.1425, + "step": 253 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 34.170026750703684, + "learning_rate": 3.2209249606415394e-05, + "loss": 1.1591, + "step": 254 + }, + { + "epoch": 0.6606217616580311, + "grad_norm": 27.52194954061608, + "learning_rate": 3.214115148858201e-05, + "loss": 1.1704, + "step": 255 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 81.65404753769732, + "learning_rate": 3.207282973728273e-05, + "loss": 1.161, + "step": 256 + }, + { + "epoch": 0.6658031088082902, + "grad_norm": 57.45351536522683, + "learning_rate": 3.200428561096737e-05, + "loss": 1.116, + "step": 257 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 30.968529074463714, + "learning_rate": 3.193552037218179e-05, + "loss": 1.1265, + "step": 258 + }, + { + "epoch": 0.6709844559585493, + "grad_norm": 37.8817748068655, + "learning_rate": 3.186653528754464e-05, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 29.197031189172545, + "learning_rate": 3.179733162772398e-05, + "loss": 1.1045, + "step": 260 + }, + { + "epoch": 0.6761658031088082, + "grad_norm": 36.56253841299107, + "learning_rate": 3.172791066741392e-05, + "loss": 1.1539, + "step": 261 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 25.799921116950998, + "learning_rate": 3.165827368531113e-05, + "loss": 1.0796, + "step": 262 + }, + { + "epoch": 0.6813471502590673, + "grad_norm": 82.81825216532526, + "learning_rate": 3.1588421964091276e-05, + "loss": 1.142, + "step": 263 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 
31.100074747569124, + "learning_rate": 3.151835679038542e-05, + "loss": 1.0908, + "step": 264 + }, + { + "epoch": 0.6865284974093264, + "grad_norm": 25.57297200703221, + "learning_rate": 3.14480794547563e-05, + "loss": 1.1436, + "step": 265 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 23.92492773149328, + "learning_rate": 3.137759125167455e-05, + "loss": 1.1202, + "step": 266 + }, + { + "epoch": 0.6917098445595855, + "grad_norm": 22.14274360766396, + "learning_rate": 3.130689347949486e-05, + "loss": 1.1113, + "step": 267 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 26.68725288649902, + "learning_rate": 3.123598744043211e-05, + "loss": 1.1517, + "step": 268 + }, + { + "epoch": 0.6968911917098446, + "grad_norm": 25.559817524659362, + "learning_rate": 3.1164874440537295e-05, + "loss": 1.0976, + "step": 269 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 28.89996834100355, + "learning_rate": 3.109355578967356e-05, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.7020725388601037, + "grad_norm": 32.09658045195569, + "learning_rate": 3.1022032801492e-05, + "loss": 1.1161, + "step": 271 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 30.623705646213768, + "learning_rate": 3.095030679340751e-05, + "loss": 1.1993, + "step": 272 + }, + { + "epoch": 0.7072538860103627, + "grad_norm": 41.71263710932429, + "learning_rate": 3.0878379086574494e-05, + "loss": 1.1624, + "step": 273 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 34.68352639470226, + "learning_rate": 3.0806251005862535e-05, + "loss": 1.1156, + "step": 274 + }, + { + "epoch": 0.7124352331606217, + "grad_norm": 23.52580702428812, + "learning_rate": 3.073392387983202e-05, + "loss": 1.0963, + "step": 275 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 28.10687988214902, + "learning_rate": 3.0661399040709584e-05, + "loss": 1.1095, + "step": 276 + }, + { + "epoch": 0.7176165803108808, + "grad_norm": 66.72288729975841, + "learning_rate": 3.05886778243637e-05, + "loss": 
1.0865, + "step": 277 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 25.775217430321934, + "learning_rate": 3.051576157027998e-05, + "loss": 1.1058, + "step": 278 + }, + { + "epoch": 0.7227979274611399, + "grad_norm": 36.82942099016794, + "learning_rate": 3.0442651621536502e-05, + "loss": 1.1211, + "step": 279 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 27.878820856521013, + "learning_rate": 3.0369349324779115e-05, + "loss": 1.1471, + "step": 280 + }, + { + "epoch": 0.727979274611399, + "grad_norm": 31.293156717285573, + "learning_rate": 3.0295856030196618e-05, + "loss": 1.0748, + "step": 281 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 39.315952115194435, + "learning_rate": 3.022217309149588e-05, + "loss": 1.0993, + "step": 282 + }, + { + "epoch": 0.7331606217616581, + "grad_norm": 36.79954071435495, + "learning_rate": 3.0148301865876913e-05, + "loss": 1.1045, + "step": 283 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 26.127389502147167, + "learning_rate": 3.0074243714007875e-05, + "loss": 1.1424, + "step": 284 + }, + { + "epoch": 0.7383419689119171, + "grad_norm": 25.608778060317068, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1055, + "step": 285 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 36.22629669671894, + "learning_rate": 2.992557209138249e-05, + "loss": 1.0845, + "step": 286 + }, + { + "epoch": 0.7435233160621761, + "grad_norm": 35.30642111132886, + "learning_rate": 2.9850961359077293e-05, + "loss": 1.204, + "step": 287 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 29.765894622087952, + "learning_rate": 2.977616917737388e-05, + "loss": 1.168, + "step": 288 + }, + { + "epoch": 0.7487046632124352, + "grad_norm": 27.194683587397567, + "learning_rate": 2.9701196923903927e-05, + "loss": 1.1236, + "step": 289 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 63.09779240191165, + "learning_rate": 2.9626045979615928e-05, + "loss": 1.1395, + "step": 290 + }, + { + "epoch": 0.7538860103626943, 
+ "grad_norm": 25.014233377763066, + "learning_rate": 2.9550717728749768e-05, + "loss": 1.1054, + "step": 291 + }, + { + "epoch": 0.7538860103626943, + "eval_loss": 1.0996382236480713, + "eval_runtime": 37.9545, + "eval_samples_per_second": 19.602, + "eval_steps_per_second": 1.238, + "step": 291 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 27.481891737318097, + "learning_rate": 2.947521355881122e-05, + "loss": 1.1252, + "step": 292 + }, + { + "epoch": 0.7590673575129534, + "grad_norm": 67.57807413949878, + "learning_rate": 2.9399534860546404e-05, + "loss": 1.1761, + "step": 293 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 65.66834495909988, + "learning_rate": 2.932368302791614e-05, + "loss": 1.0551, + "step": 294 + }, + { + "epoch": 0.7642487046632125, + "grad_norm": 30.051210942517116, + "learning_rate": 2.92476594580703e-05, + "loss": 1.138, + "step": 295 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 22.693089678510507, + "learning_rate": 2.917146555132206e-05, + "loss": 1.1495, + "step": 296 + }, + { + "epoch": 0.7694300518134715, + "grad_norm": 53.84166280540606, + "learning_rate": 2.909510271112212e-05, + "loss": 1.1409, + "step": 297 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 32.69106061524578, + "learning_rate": 2.9018572344032823e-05, + "loss": 1.1709, + "step": 298 + }, + { + "epoch": 0.7746113989637305, + "grad_norm": 39.44484991312582, + "learning_rate": 2.8941875859702283e-05, + "loss": 1.1138, + "step": 299 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 31.51857596969122, + "learning_rate": 2.88650146708384e-05, + "loss": 1.1931, + "step": 300 + }, + { + "epoch": 0.7797927461139896, + "grad_norm": 70.51218412614058, + "learning_rate": 2.878799019318283e-05, + "loss": 1.155, + "step": 301 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 80.27969224752457, + "learning_rate": 2.8710803845484955e-05, + "loss": 1.1425, + "step": 302 + }, + { + "epoch": 0.7849740932642487, + "grad_norm": 
28.16560857981767, + "learning_rate": 2.8633457049475678e-05, + "loss": 1.1072, + "step": 303 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 41.15138307552231, + "learning_rate": 2.855595122984129e-05, + "loss": 1.1492, + "step": 304 + }, + { + "epoch": 0.7901554404145078, + "grad_norm": 23.894217282116276, + "learning_rate": 2.847828781419722e-05, + "loss": 1.1136, + "step": 305 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 25.005501120810248, + "learning_rate": 2.8400468233061708e-05, + "loss": 1.0921, + "step": 306 + }, + { + "epoch": 0.7953367875647669, + "grad_norm": 30.91791938195468, + "learning_rate": 2.832249391982949e-05, + "loss": 1.1098, + "step": 307 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 44.776563922922726, + "learning_rate": 2.8244366310745398e-05, + "loss": 1.1845, + "step": 308 + }, + { + "epoch": 0.8005181347150259, + "grad_norm": 19.059329544784376, + "learning_rate": 2.816608684487787e-05, + "loss": 1.169, + "step": 309 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 63.97334641962602, + "learning_rate": 2.8087656964092472e-05, + "loss": 1.124, + "step": 310 + }, + { + "epoch": 0.805699481865285, + "grad_norm": 30.878848859015882, + "learning_rate": 2.8009078113025335e-05, + "loss": 1.2087, + "step": 311 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 34.63835471543836, + "learning_rate": 2.7930351739056533e-05, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.810880829015544, + "grad_norm": 30.03178182445718, + "learning_rate": 2.7851479292283442e-05, + "loss": 1.1321, + "step": 313 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 38.42236523356876, + "learning_rate": 2.7772462225494013e-05, + "loss": 1.1557, + "step": 314 + }, + { + "epoch": 0.8160621761658031, + "grad_norm": 39.179683790956744, + "learning_rate": 2.7693301994140026e-05, + "loss": 1.1201, + "step": 315 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 38.32243159447327, + "learning_rate": 2.761400005631028e-05, + 
"loss": 1.1105, + "step": 316 + }, + { + "epoch": 0.8212435233160622, + "grad_norm": 39.913808227411835, + "learning_rate": 2.7534557872703705e-05, + "loss": 1.1598, + "step": 317 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 69.73521867812421, + "learning_rate": 2.7454976906602513e-05, + "loss": 1.1145, + "step": 318 + }, + { + "epoch": 0.8264248704663213, + "grad_norm": 65.55887588207746, + "learning_rate": 2.7375258623845207e-05, + "loss": 1.1255, + "step": 319 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 30.980111545641563, + "learning_rate": 2.7295404492799575e-05, + "loss": 1.122, + "step": 320 + }, + { + "epoch": 0.8316062176165803, + "grad_norm": 30.12179911444832, + "learning_rate": 2.721541598433567e-05, + "loss": 1.113, + "step": 321 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 28.329434659508582, + "learning_rate": 2.7135294571798706e-05, + "loss": 1.0498, + "step": 322 + }, + { + "epoch": 0.8367875647668394, + "grad_norm": 25.114787597049578, + "learning_rate": 2.70550417309819e-05, + "loss": 1.0633, + "step": 323 + }, + { + "epoch": 0.8393782383419689, + "grad_norm": 27.754037709590385, + "learning_rate": 2.6974658940099337e-05, + "loss": 1.1585, + "step": 324 + }, + { + "epoch": 0.8419689119170984, + "grad_norm": 29.489888159179444, + "learning_rate": 2.6894147679758678e-05, + "loss": 1.1259, + "step": 325 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 24.426102194202898, + "learning_rate": 2.6813509432933957e-05, + "loss": 1.1515, + "step": 326 + }, + { + "epoch": 0.8471502590673575, + "grad_norm": 24.75197483331429, + "learning_rate": 2.673274568493821e-05, + "loss": 1.15, + "step": 327 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 40.604864626683366, + "learning_rate": 2.6651857923396132e-05, + "loss": 1.1219, + "step": 328 + }, + { + "epoch": 0.8523316062176166, + "grad_norm": 34.694568404196026, + "learning_rate": 2.6570847638216698e-05, + "loss": 1.103, + "step": 329 + }, + { + "epoch": 
0.8549222797927462, + "grad_norm": 48.715136403425035, + "learning_rate": 2.648971632156569e-05, + "loss": 1.1675, + "step": 330 + }, + { + "epoch": 0.8575129533678757, + "grad_norm": 97.77526410121799, + "learning_rate": 2.6408465467838225e-05, + "loss": 1.1502, + "step": 331 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 54.697215318949276, + "learning_rate": 2.632709657363124e-05, + "loss": 1.1446, + "step": 332 + }, + { + "epoch": 0.8626943005181347, + "grad_norm": 38.09192002041798, + "learning_rate": 2.6245611137715897e-05, + "loss": 1.1333, + "step": 333 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 46.713623556984956, + "learning_rate": 2.6164010661010007e-05, + "loss": 1.1252, + "step": 334 + }, + { + "epoch": 0.8678756476683938, + "grad_norm": 46.40552686286593, + "learning_rate": 2.6082296646550364e-05, + "loss": 1.121, + "step": 335 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 37.57424454065957, + "learning_rate": 2.6000470599465065e-05, + "loss": 1.1671, + "step": 336 + }, + { + "epoch": 0.8730569948186528, + "grad_norm": 38.580777053099204, + "learning_rate": 2.5918534026945787e-05, + "loss": 1.0849, + "step": 337 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 154.3106712010981, + "learning_rate": 2.5836488438220044e-05, + "loss": 1.0663, + "step": 338 + }, + { + "epoch": 0.8782383419689119, + "grad_norm": 34.21394067951015, + "learning_rate": 2.575433534452334e-05, + "loss": 1.0895, + "step": 339 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 36.291611242733886, + "learning_rate": 2.5672076259071385e-05, + "loss": 1.1242, + "step": 340 + }, + { + "epoch": 0.883419689119171, + "grad_norm": 29.411623389655112, + "learning_rate": 2.558971269703219e-05, + "loss": 1.1005, + "step": 341 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 30.24903086761753, + "learning_rate": 2.5507246175498174e-05, + "loss": 1.1134, + "step": 342 + }, + { + "epoch": 0.8886010362694301, + "grad_norm": 22.032293114161938, + 
"learning_rate": 2.5424678213458202e-05, + "loss": 1.1121, + "step": 343 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 34.997361528376956, + "learning_rate": 2.5342010331769635e-05, + "loss": 1.1341, + "step": 344 + }, + { + "epoch": 0.8937823834196891, + "grad_norm": 28.212824875732352, + "learning_rate": 2.5259244053130295e-05, + "loss": 1.0748, + "step": 345 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 23.870011592985897, + "learning_rate": 2.5176380902050418e-05, + "loss": 1.0643, + "step": 346 + }, + { + "epoch": 0.8989637305699482, + "grad_norm": 26.10018699309748, + "learning_rate": 2.5093422404824574e-05, + "loss": 1.1662, + "step": 347 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 30.191468778559166, + "learning_rate": 2.5010370089503578e-05, + "loss": 1.1023, + "step": 348 + }, + { + "epoch": 0.9041450777202072, + "grad_norm": 55.799581973427415, + "learning_rate": 2.4927225485866297e-05, + "loss": 1.1538, + "step": 349 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 35.7030284720465, + "learning_rate": 2.4843990125391516e-05, + "loss": 1.1, + "step": 350 + }, + { + "epoch": 0.9093264248704663, + "grad_norm": 28.61763302791738, + "learning_rate": 2.4760665541229712e-05, + "loss": 1.0914, + "step": 351 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 33.34233685155311, + "learning_rate": 2.467725326817481e-05, + "loss": 1.0862, + "step": 352 + }, + { + "epoch": 0.9145077720207254, + "grad_norm": 25.441052078480084, + "learning_rate": 2.4593754842635917e-05, + "loss": 1.1422, + "step": 353 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 24.217974454985058, + "learning_rate": 2.451017180260902e-05, + "loss": 1.132, + "step": 354 + }, + { + "epoch": 0.9196891191709845, + "grad_norm": 57.986011465793155, + "learning_rate": 2.4426505687648653e-05, + "loss": 1.2082, + "step": 355 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 34.058264716876195, + "learning_rate": 2.4342758038839573e-05, + "loss": 1.1679, + 
"step": 356 + }, + { + "epoch": 0.9248704663212435, + "grad_norm": 28.621514922275253, + "learning_rate": 2.4258930398768317e-05, + "loss": 1.1319, + "step": 357 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 35.33355417283227, + "learning_rate": 2.4175024311494835e-05, + "loss": 1.0705, + "step": 358 + }, + { + "epoch": 0.9300518134715026, + "grad_norm": 46.579572933583265, + "learning_rate": 2.4091041322524023e-05, + "loss": 1.0842, + "step": 359 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 35.494740787672974, + "learning_rate": 2.4006982978777263e-05, + "loss": 1.1072, + "step": 360 + }, + { + "epoch": 0.9352331606217616, + "grad_norm": 44.56606839509262, + "learning_rate": 2.392285082856394e-05, + "loss": 1.1125, + "step": 361 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 46.26363869084929, + "learning_rate": 2.3838646421552917e-05, + "loss": 1.1268, + "step": 362 + }, + { + "epoch": 0.9404145077720207, + "grad_norm": 89.17676267680146, + "learning_rate": 2.3754371308743975e-05, + "loss": 1.0893, + "step": 363 + }, + { + "epoch": 0.9430051813471503, + "grad_norm": 34.87700187494181, + "learning_rate": 2.367002704243927e-05, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.9455958549222798, + "grad_norm": 32.92806939217504, + "learning_rate": 2.3585615176214716e-05, + "loss": 1.1488, + "step": 365 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 27.27458755248548, + "learning_rate": 2.3501137264891396e-05, + "loss": 1.0874, + "step": 366 + }, + { + "epoch": 0.9507772020725389, + "grad_norm": 24.959123789739834, + "learning_rate": 2.3416594864506887e-05, + "loss": 1.1783, + "step": 367 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 31.838670988369724, + "learning_rate": 2.333198953228664e-05, + "loss": 1.0759, + "step": 368 + }, + { + "epoch": 0.9559585492227979, + "grad_norm": 28.112870222863155, + "learning_rate": 2.3247322826615276e-05, + "loss": 1.1481, + "step": 369 + }, + { + "epoch": 0.9585492227979274, + 
"grad_norm": 35.08461098450067, + "learning_rate": 2.316259630700787e-05, + "loss": 1.0953, + "step": 370 + }, + { + "epoch": 0.961139896373057, + "grad_norm": 37.80899503618479, + "learning_rate": 2.307781153408124e-05, + "loss": 1.1224, + "step": 371 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 31.644978122007387, + "learning_rate": 2.2992970069525202e-05, + "loss": 1.1608, + "step": 372 + }, + { + "epoch": 0.966321243523316, + "grad_norm": 23.51029318210938, + "learning_rate": 2.29080734760738e-05, + "loss": 1.0914, + "step": 373 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 28.97240481418573, + "learning_rate": 2.2823123317476522e-05, + "loss": 1.1117, + "step": 374 + }, + { + "epoch": 0.9715025906735751, + "grad_norm": 36.613893678320395, + "learning_rate": 2.273812115846951e-05, + "loss": 1.1118, + "step": 375 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 26.402979304578093, + "learning_rate": 2.2653068564746692e-05, + "loss": 1.13, + "step": 376 + }, + { + "epoch": 0.9766839378238342, + "grad_norm": 114.3000444613392, + "learning_rate": 2.2567967102931025e-05, + "loss": 1.1539, + "step": 377 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 26.861359932396834, + "learning_rate": 2.2482818340545534e-05, + "loss": 1.0566, + "step": 378 + }, + { + "epoch": 0.9818652849740933, + "grad_norm": 32.75509374223994, + "learning_rate": 2.2397623845984548e-05, + "loss": 1.1746, + "step": 379 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 34.11964206838379, + "learning_rate": 2.2312385188484718e-05, + "loss": 1.0834, + "step": 380 + }, + { + "epoch": 0.9870466321243523, + "grad_norm": 38.019564122226434, + "learning_rate": 2.2227103938096176e-05, + "loss": 1.1074, + "step": 381 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 39.5073811375391, + "learning_rate": 2.2141781665653584e-05, + "loss": 1.1082, + "step": 382 + }, + { + "epoch": 0.9922279792746114, + "grad_norm": 298.4258332795163, + "learning_rate": 
2.205641994274721e-05, + "loss": 1.125, + "step": 383 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 36.444415670935506, + "learning_rate": 2.1971020341693973e-05, + "loss": 1.0935, + "step": 384 + }, + { + "epoch": 0.9974093264248705, + "grad_norm": 28.96533429210575, + "learning_rate": 2.188558443550849e-05, + "loss": 1.0957, + "step": 385 + }, + { + "epoch": 1.0, + "grad_norm": 66.41241684127401, + "learning_rate": 2.180011379787411e-05, + "loss": 1.1335, + "step": 386 + }, + { + "epoch": 1.0025906735751295, + "grad_norm": 28.75549619538953, + "learning_rate": 2.1714610003113887e-05, + "loss": 1.1316, + "step": 387 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 26.911837500852275, + "learning_rate": 2.1629074626161647e-05, + "loss": 1.1026, + "step": 388 + }, + { + "epoch": 1.005181347150259, + "eval_loss": 1.0908173322677612, + "eval_runtime": 37.7642, + "eval_samples_per_second": 19.701, + "eval_steps_per_second": 1.245, + "step": 388 + }, + { + "epoch": 1.0077720207253886, + "grad_norm": 34.28722746775385, + "learning_rate": 2.1543509242532932e-05, + "loss": 1.1104, + "step": 389 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 37.97709310694863, + "learning_rate": 2.145791542829597e-05, + "loss": 1.0663, + "step": 390 + }, + { + "epoch": 1.0129533678756477, + "grad_norm": 39.379668162327384, + "learning_rate": 2.1372294760042686e-05, + "loss": 1.1405, + "step": 391 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 27.136201219298698, + "learning_rate": 2.1286648814859636e-05, + "loss": 1.0963, + "step": 392 + }, + { + "epoch": 1.0181347150259068, + "grad_norm": 39.34261641469313, + "learning_rate": 2.120097917029897e-05, + "loss": 1.1276, + "step": 393 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 46.77583801285328, + "learning_rate": 2.1115287404349357e-05, + "loss": 1.1171, + "step": 394 + }, + { + "epoch": 1.0233160621761659, + "grad_norm": 55.10335066695868, + "learning_rate": 2.1029575095406933e-05, + "loss": 1.0831, + 
"step": 395 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 76.88533851789373, + "learning_rate": 2.0943843822246234e-05, + "loss": 1.0925, + "step": 396 + }, + { + "epoch": 1.028497409326425, + "grad_norm": 29.604569209708462, + "learning_rate": 2.0858095163991094e-05, + "loss": 1.1259, + "step": 397 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 37.71348366628868, + "learning_rate": 2.077233070008557e-05, + "loss": 1.0792, + "step": 398 + }, + { + "epoch": 1.0336787564766838, + "grad_norm": 26.866133194031644, + "learning_rate": 2.0686552010264872e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 35.739274800620635, + "learning_rate": 2.060076067452622e-05, + "loss": 1.0837, + "step": 400 + }, + { + "epoch": 1.038860103626943, + "grad_norm": 24.479129391259896, + "learning_rate": 2.0514958273099778e-05, + "loss": 1.073, + "step": 401 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 50.49963650108008, + "learning_rate": 2.042914638641952e-05, + "loss": 1.0912, + "step": 402 + }, + { + "epoch": 1.044041450777202, + "grad_norm": 35.6875451072032, + "learning_rate": 2.0343326595094154e-05, + "loss": 1.0936, + "step": 403 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 30.212298193414487, + "learning_rate": 2.0257500479877965e-05, + "loss": 1.089, + "step": 404 + }, + { + "epoch": 1.049222797927461, + "grad_norm": 28.65828720015124, + "learning_rate": 2.0171669621641743e-05, + "loss": 1.1727, + "step": 405 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 39.2199058392425, + "learning_rate": 2.0085835601343627e-05, + "loss": 1.1493, + "step": 406 + }, + { + "epoch": 1.0544041450777202, + "grad_norm": 110.01204177059546, + "learning_rate": 2e-05, + "loss": 1.1245, + "step": 407 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 43.427381349600374, + "learning_rate": 1.9914164398656383e-05, + "loss": 1.1183, + "step": 408 + }, + { + "epoch": 1.0595854922279793, + "grad_norm": 
64.78768909817894, + "learning_rate": 1.9828330378358264e-05, + "loss": 1.1528, + "step": 409 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 26.50257915912425, + "learning_rate": 1.974249952012204e-05, + "loss": 1.1568, + "step": 410 + }, + { + "epoch": 1.0647668393782384, + "grad_norm": 27.63159204178893, + "learning_rate": 1.9656673404905852e-05, + "loss": 1.1071, + "step": 411 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 27.0795355533723, + "learning_rate": 1.957085361358049e-05, + "loss": 1.0809, + "step": 412 + }, + { + "epoch": 1.0699481865284974, + "grad_norm": 41.84795332660821, + "learning_rate": 1.9485041726900232e-05, + "loss": 1.0744, + "step": 413 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 143.2109134427192, + "learning_rate": 1.939923932547379e-05, + "loss": 1.0905, + "step": 414 + }, + { + "epoch": 1.0751295336787565, + "grad_norm": 89.55384065946154, + "learning_rate": 1.931344798973513e-05, + "loss": 1.1012, + "step": 415 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 31.072074793068015, + "learning_rate": 1.922766929991443e-05, + "loss": 1.1141, + "step": 416 + }, + { + "epoch": 1.0803108808290156, + "grad_norm": 29.82683189045969, + "learning_rate": 1.914190483600891e-05, + "loss": 1.0842, + "step": 417 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 30.09708662586305, + "learning_rate": 1.9056156177753776e-05, + "loss": 1.1088, + "step": 418 + }, + { + "epoch": 1.0854922279792747, + "grad_norm": 27.637437518920503, + "learning_rate": 1.897042490459307e-05, + "loss": 1.058, + "step": 419 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 69.34285700381683, + "learning_rate": 1.8884712595650653e-05, + "loss": 1.0314, + "step": 420 + }, + { + "epoch": 1.0906735751295338, + "grad_norm": 25.644927284592956, + "learning_rate": 1.8799020829701036e-05, + "loss": 1.0916, + "step": 421 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 30.3898986852319, + "learning_rate": 1.871335118514037e-05, + "loss": 
1.0797, + "step": 422 + }, + { + "epoch": 1.0958549222797926, + "grad_norm": 22.271334693423444, + "learning_rate": 1.862770523995732e-05, + "loss": 1.1134, + "step": 423 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 35.85874616678876, + "learning_rate": 1.854208457170404e-05, + "loss": 1.0927, + "step": 424 + }, + { + "epoch": 1.1010362694300517, + "grad_norm": 43.06832041948097, + "learning_rate": 1.8456490757467075e-05, + "loss": 1.093, + "step": 425 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 37.83777637993467, + "learning_rate": 1.8370925373838356e-05, + "loss": 1.1268, + "step": 426 + }, + { + "epoch": 1.1062176165803108, + "grad_norm": 23.798059023605177, + "learning_rate": 1.8285389996886113e-05, + "loss": 1.0989, + "step": 427 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 25.443104465500795, + "learning_rate": 1.8199886202125897e-05, + "loss": 1.0581, + "step": 428 + }, + { + "epoch": 1.11139896373057, + "grad_norm": 23.76241444847441, + "learning_rate": 1.8114415564491513e-05, + "loss": 1.0908, + "step": 429 + }, + { + "epoch": 1.1139896373056994, + "grad_norm": 26.5600693044426, + "learning_rate": 1.8028979658306033e-05, + "loss": 1.1321, + "step": 430 + }, + { + "epoch": 1.116580310880829, + "grad_norm": 44.854375199828986, + "learning_rate": 1.794358005725279e-05, + "loss": 1.0762, + "step": 431 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 28.05797777410846, + "learning_rate": 1.785821833434642e-05, + "loss": 1.0698, + "step": 432 + }, + { + "epoch": 1.121761658031088, + "grad_norm": 26.488479630212364, + "learning_rate": 1.7772896061903824e-05, + "loss": 1.1223, + "step": 433 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 32.77084542157883, + "learning_rate": 1.768761481151529e-05, + "loss": 1.0984, + "step": 434 + }, + { + "epoch": 1.1269430051813472, + "grad_norm": 39.13198413130026, + "learning_rate": 1.7602376154015456e-05, + "loss": 1.1551, + "step": 435 + }, + { + "epoch": 1.1295336787564767, + 
"grad_norm": 23.878966995283953, + "learning_rate": 1.751718165945447e-05, + "loss": 1.1133, + "step": 436 + }, + { + "epoch": 1.1321243523316062, + "grad_norm": 33.90472985566232, + "learning_rate": 1.743203289706898e-05, + "loss": 1.1219, + "step": 437 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 23.340369938533712, + "learning_rate": 1.734693143525331e-05, + "loss": 1.1244, + "step": 438 + }, + { + "epoch": 1.1373056994818653, + "grad_norm": 105.6885206147852, + "learning_rate": 1.7261878841530494e-05, + "loss": 1.0788, + "step": 439 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 28.453526076458317, + "learning_rate": 1.717687668252348e-05, + "loss": 1.1576, + "step": 440 + }, + { + "epoch": 1.1424870466321244, + "grad_norm": 36.1473991485961, + "learning_rate": 1.7091926523926205e-05, + "loss": 1.0859, + "step": 441 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 27.043461146902448, + "learning_rate": 1.7007029930474804e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 1.1476683937823835, + "grad_norm": 28.066170619981435, + "learning_rate": 1.6922188465918763e-05, + "loss": 1.1279, + "step": 443 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 38.62445822837212, + "learning_rate": 1.6837403692992136e-05, + "loss": 1.1275, + "step": 444 + }, + { + "epoch": 1.1528497409326426, + "grad_norm": 28.077258963587767, + "learning_rate": 1.6752677173384734e-05, + "loss": 1.1004, + "step": 445 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 42.1405744301338, + "learning_rate": 1.6668010467713363e-05, + "loss": 1.1141, + "step": 446 + }, + { + "epoch": 1.1580310880829017, + "grad_norm": 26.827291684301034, + "learning_rate": 1.658340513549312e-05, + "loss": 1.1216, + "step": 447 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 30.863489441619983, + "learning_rate": 1.649886273510861e-05, + "loss": 1.1898, + "step": 448 + }, + { + "epoch": 1.1632124352331605, + "grad_norm": 27.73579733476068, + "learning_rate": 
1.641438482378529e-05, + "loss": 1.0971, + "step": 449 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 32.84347174567353, + "learning_rate": 1.6329972957560736e-05, + "loss": 1.0579, + "step": 450 + }, + { + "epoch": 1.1683937823834196, + "grad_norm": 30.06456192962641, + "learning_rate": 1.6245628691256032e-05, + "loss": 1.1057, + "step": 451 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 36.554506394377846, + "learning_rate": 1.616135357844709e-05, + "loss": 1.1008, + "step": 452 + }, + { + "epoch": 1.1735751295336787, + "grad_norm": 27.358643056184114, + "learning_rate": 1.6077149171436063e-05, + "loss": 1.101, + "step": 453 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 111.13373813893604, + "learning_rate": 1.599301702122274e-05, + "loss": 1.0688, + "step": 454 + }, + { + "epoch": 1.1787564766839378, + "grad_norm": 33.94168250727336, + "learning_rate": 1.590895867747599e-05, + "loss": 1.0721, + "step": 455 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 53.93978395349692, + "learning_rate": 1.582497568850517e-05, + "loss": 1.0584, + "step": 456 + }, + { + "epoch": 1.1839378238341969, + "grad_norm": 29.19245794937285, + "learning_rate": 1.574106960123169e-05, + "loss": 1.067, + "step": 457 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 28.06897801999048, + "learning_rate": 1.5657241961160434e-05, + "loss": 1.0899, + "step": 458 + }, + { + "epoch": 1.189119170984456, + "grad_norm": 52.31256652964293, + "learning_rate": 1.557349431235135e-05, + "loss": 1.0925, + "step": 459 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 65.39771110845307, + "learning_rate": 1.5489828197390988e-05, + "loss": 1.1448, + "step": 460 + }, + { + "epoch": 1.194300518134715, + "grad_norm": 27.062780348557254, + "learning_rate": 1.5406245157364093e-05, + "loss": 1.0871, + "step": 461 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 41.667025056250424, + "learning_rate": 1.5322746731825195e-05, + "loss": 1.048, + "step": 462 + }, + { + 
"epoch": 1.1994818652849741, + "grad_norm": 24.936669803360665, + "learning_rate": 1.5239334458770291e-05, + "loss": 1.1243, + "step": 463 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 26.65392149600558, + "learning_rate": 1.5156009874608484e-05, + "loss": 1.0919, + "step": 464 + }, + { + "epoch": 1.2046632124352332, + "grad_norm": 48.57730651937978, + "learning_rate": 1.5072774514133708e-05, + "loss": 1.1259, + "step": 465 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 31.34891257114439, + "learning_rate": 1.4989629910496424e-05, + "loss": 1.0733, + "step": 466 + }, + { + "epoch": 1.2098445595854923, + "grad_norm": 24.541559850584985, + "learning_rate": 1.4906577595175428e-05, + "loss": 1.1166, + "step": 467 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 20.4345832961354, + "learning_rate": 1.4823619097949584e-05, + "loss": 1.0916, + "step": 468 + }, + { + "epoch": 1.2150259067357512, + "grad_norm": 28.860712194727487, + "learning_rate": 1.4740755946869708e-05, + "loss": 1.1043, + "step": 469 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 25.71820242946282, + "learning_rate": 1.4657989668230363e-05, + "loss": 1.0949, + "step": 470 + }, + { + "epoch": 1.2202072538860103, + "grad_norm": 51.16994773097077, + "learning_rate": 1.4575321786541801e-05, + "loss": 1.141, + "step": 471 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 32.70442309640389, + "learning_rate": 1.4492753824501833e-05, + "loss": 1.1127, + "step": 472 + }, + { + "epoch": 1.2253886010362693, + "grad_norm": 21.913285172411495, + "learning_rate": 1.4410287302967813e-05, + "loss": 1.084, + "step": 473 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 34.45727214001296, + "learning_rate": 1.4327923740928613e-05, + "loss": 1.0836, + "step": 474 + }, + { + "epoch": 1.2305699481865284, + "grad_norm": 26.768013926034776, + "learning_rate": 1.4245664655476663e-05, + "loss": 1.1264, + "step": 475 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 
28.401965255935572, + "learning_rate": 1.4163511561779956e-05, + "loss": 1.0805, + "step": 476 + }, + { + "epoch": 1.2357512953367875, + "grad_norm": 29.19935757288793, + "learning_rate": 1.4081465973054216e-05, + "loss": 1.0825, + "step": 477 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 24.55918541541201, + "learning_rate": 1.3999529400534941e-05, + "loss": 1.1164, + "step": 478 + }, + { + "epoch": 1.2409326424870466, + "grad_norm": 25.35635406268312, + "learning_rate": 1.3917703353449646e-05, + "loss": 1.1334, + "step": 479 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 45.453901005004184, + "learning_rate": 1.3835989338989996e-05, + "loss": 1.1387, + "step": 480 + }, + { + "epoch": 1.2461139896373057, + "grad_norm": 21.67852694202104, + "learning_rate": 1.375438886228411e-05, + "loss": 1.0846, + "step": 481 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 171.2474074894732, + "learning_rate": 1.3672903426368773e-05, + "loss": 1.1388, + "step": 482 + }, + { + "epoch": 1.2512953367875648, + "grad_norm": 43.18223835070906, + "learning_rate": 1.3591534532161781e-05, + "loss": 1.1483, + "step": 483 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 29.447332565856644, + "learning_rate": 1.3510283678434317e-05, + "loss": 1.07, + "step": 484 + }, + { + "epoch": 1.2564766839378239, + "grad_norm": 28.600251051615228, + "learning_rate": 1.3429152361783307e-05, + "loss": 1.0798, + "step": 485 + }, + { + "epoch": 1.2564766839378239, + "eval_loss": 1.085669755935669, + "eval_runtime": 38.1134, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.233, + "step": 485 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 47.124643074410464, + "learning_rate": 1.3348142076603876e-05, + "loss": 1.0875, + "step": 486 + }, + { + "epoch": 1.261658031088083, + "grad_norm": 42.06019726307143, + "learning_rate": 1.3267254315061797e-05, + "loss": 1.1429, + "step": 487 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 18.950734630756962, + 
"learning_rate": 1.318649056706605e-05, + "loss": 1.0747, + "step": 488 + }, + { + "epoch": 1.266839378238342, + "grad_norm": 31.903949502516806, + "learning_rate": 1.3105852320241326e-05, + "loss": 1.1041, + "step": 489 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 22.957473008085927, + "learning_rate": 1.3025341059900675e-05, + "loss": 1.1046, + "step": 490 + }, + { + "epoch": 1.2720207253886011, + "grad_norm": 22.325983256563678, + "learning_rate": 1.2944958269018103e-05, + "loss": 1.0643, + "step": 491 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 29.689383331974955, + "learning_rate": 1.2864705428201307e-05, + "loss": 1.0949, + "step": 492 + }, + { + "epoch": 1.2772020725388602, + "grad_norm": 25.338298442945575, + "learning_rate": 1.2784584015664337e-05, + "loss": 1.0725, + "step": 493 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 31.591732488078588, + "learning_rate": 1.2704595507200435e-05, + "loss": 1.0347, + "step": 494 + }, + { + "epoch": 1.2823834196891193, + "grad_norm": 42.96243570696118, + "learning_rate": 1.26247413761548e-05, + "loss": 1.1196, + "step": 495 + }, + { + "epoch": 1.2849740932642488, + "grad_norm": 26.559546676266024, + "learning_rate": 1.254502309339749e-05, + "loss": 1.0187, + "step": 496 + }, + { + "epoch": 1.2875647668393784, + "grad_norm": 27.58444017584016, + "learning_rate": 1.2465442127296297e-05, + "loss": 1.0985, + "step": 497 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 36.53028730423797, + "learning_rate": 1.2385999943689732e-05, + "loss": 1.068, + "step": 498 + }, + { + "epoch": 1.2927461139896372, + "grad_norm": 38.94837307599113, + "learning_rate": 1.2306698005859975e-05, + "loss": 1.0736, + "step": 499 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 36.67208266195125, + "learning_rate": 1.2227537774505996e-05, + "loss": 1.119, + "step": 500 + }, + { + "epoch": 1.2979274611398963, + "grad_norm": 31.086410648635283, + "learning_rate": 1.2148520707716567e-05, + "loss": 1.1094, + 
"step": 501 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 27.96977481605826, + "learning_rate": 1.2069648260943473e-05, + "loss": 1.1345, + "step": 502 + }, + { + "epoch": 1.3031088082901554, + "grad_norm": 22.89450502840197, + "learning_rate": 1.1990921886974669e-05, + "loss": 1.12, + "step": 503 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 18.54206032224653, + "learning_rate": 1.1912343035907535e-05, + "loss": 1.0929, + "step": 504 + }, + { + "epoch": 1.3082901554404145, + "grad_norm": 38.9386007237313, + "learning_rate": 1.1833913155122132e-05, + "loss": 1.1381, + "step": 505 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 37.05899458809635, + "learning_rate": 1.1755633689254609e-05, + "loss": 1.0535, + "step": 506 + }, + { + "epoch": 1.3134715025906736, + "grad_norm": 27.716372794195156, + "learning_rate": 1.1677506080170512e-05, + "loss": 1.1342, + "step": 507 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 40.42306246079416, + "learning_rate": 1.1599531766938306e-05, + "loss": 1.0887, + "step": 508 + }, + { + "epoch": 1.3186528497409327, + "grad_norm": 98.56681767405578, + "learning_rate": 1.1521712185802789e-05, + "loss": 1.0954, + "step": 509 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 34.42816933350743, + "learning_rate": 1.1444048770158718e-05, + "loss": 1.0512, + "step": 510 + }, + { + "epoch": 1.3238341968911918, + "grad_norm": 52.457523653614096, + "learning_rate": 1.136654295052433e-05, + "loss": 1.1599, + "step": 511 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 26.832339531661276, + "learning_rate": 1.1289196154515048e-05, + "loss": 1.0602, + "step": 512 + }, + { + "epoch": 1.3290155440414508, + "grad_norm": 32.746047673769816, + "learning_rate": 1.1212009806817163e-05, + "loss": 1.1544, + "step": 513 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 37.44483451702055, + "learning_rate": 1.1134985329161608e-05, + "loss": 1.1421, + "step": 514 + }, + { + "epoch": 1.33419689119171, + "grad_norm": 
28.625976525737606, + "learning_rate": 1.1058124140297718e-05, + "loss": 1.0858, + "step": 515 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 38.64141195246213, + "learning_rate": 1.0981427655967183e-05, + "loss": 1.0983, + "step": 516 + }, + { + "epoch": 1.3393782383419688, + "grad_norm": 29.989753893533425, + "learning_rate": 1.0904897288877891e-05, + "loss": 1.1269, + "step": 517 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 48.63990665515511, + "learning_rate": 1.0828534448677942e-05, + "loss": 1.0844, + "step": 518 + }, + { + "epoch": 1.3445595854922279, + "grad_norm": 25.477227318250847, + "learning_rate": 1.0752340541929711e-05, + "loss": 1.0742, + "step": 519 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 26.363588814537763, + "learning_rate": 1.0676316972083867e-05, + "loss": 1.0533, + "step": 520 + }, + { + "epoch": 1.349740932642487, + "grad_norm": 34.59968737708606, + "learning_rate": 1.060046513945361e-05, + "loss": 1.0983, + "step": 521 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 52.51652561846762, + "learning_rate": 1.0524786441188786e-05, + "loss": 1.1319, + "step": 522 + }, + { + "epoch": 1.354922279792746, + "grad_norm": 21.360221214301127, + "learning_rate": 1.0449282271250239e-05, + "loss": 1.0627, + "step": 523 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 37.00053933682603, + "learning_rate": 1.0373954020384073e-05, + "loss": 1.096, + "step": 524 + }, + { + "epoch": 1.3601036269430051, + "grad_norm": 39.212240822687484, + "learning_rate": 1.029880307609608e-05, + "loss": 1.0512, + "step": 525 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 24.89842378385804, + "learning_rate": 1.0223830822626124e-05, + "loss": 1.0538, + "step": 526 + }, + { + "epoch": 1.3652849740932642, + "grad_norm": 29.14416894424653, + "learning_rate": 1.0149038640922715e-05, + "loss": 1.1538, + "step": 527 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 31.688722122648855, + "learning_rate": 
1.0074427908617515e-05, + "loss": 1.171, + "step": 528 + }, + { + "epoch": 1.3704663212435233, + "grad_norm": 41.918909004413734, + "learning_rate": 1.0000000000000006e-05, + "loss": 1.1203, + "step": 529 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 26.70963454516576, + "learning_rate": 9.92575628599213e-06, + "loss": 1.0855, + "step": 530 + }, + { + "epoch": 1.3756476683937824, + "grad_norm": 24.819351173466824, + "learning_rate": 9.851698134123095e-06, + "loss": 1.0972, + "step": 531 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 22.100465399566815, + "learning_rate": 9.777826908504126e-06, + "loss": 1.08, + "step": 532 + }, + { + "epoch": 1.3808290155440415, + "grad_norm": 29.31574709406259, + "learning_rate": 9.704143969803392e-06, + "loss": 1.0835, + "step": 533 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 25.551326748473052, + "learning_rate": 9.630650675220892e-06, + "loss": 1.0396, + "step": 534 + }, + { + "epoch": 1.3860103626943006, + "grad_norm": 59.07595627892596, + "learning_rate": 9.557348378463503e-06, + "loss": 1.0814, + "step": 535 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 24.96501978981908, + "learning_rate": 9.484238429720018e-06, + "loss": 1.0187, + "step": 536 + }, + { + "epoch": 1.3911917098445596, + "grad_norm": 42.530604702279234, + "learning_rate": 9.411322175636298e-06, + "loss": 1.074, + "step": 537 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 34.91129065632851, + "learning_rate": 9.338600959290414e-06, + "loss": 1.0878, + "step": 538 + }, + { + "epoch": 1.3963730569948187, + "grad_norm": 32.07525956876426, + "learning_rate": 9.266076120167992e-06, + "loss": 1.0962, + "step": 539 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 40.18387743296675, + "learning_rate": 9.193748994137462e-06, + "loss": 1.1033, + "step": 540 + }, + { + "epoch": 1.4015544041450778, + "grad_norm": 66.68031460980451, + "learning_rate": 9.121620913425508e-06, + "loss": 1.1466, + "step": 541 + }, + { + "epoch": 
1.4041450777202074, + "grad_norm": 34.07506059584738, + "learning_rate": 9.04969320659249e-06, + "loss": 1.1184, + "step": 542 + }, + { + "epoch": 1.406735751295337, + "grad_norm": 17.130845779169075, + "learning_rate": 8.977967198508001e-06, + "loss": 1.0803, + "step": 543 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 22.4457025132615, + "learning_rate": 8.906444210326441e-06, + "loss": 1.0745, + "step": 544 + }, + { + "epoch": 1.411917098445596, + "grad_norm": 73.43971735356851, + "learning_rate": 8.83512555946271e-06, + "loss": 1.0717, + "step": 545 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 38.16321297719761, + "learning_rate": 8.764012559567899e-06, + "loss": 1.1371, + "step": 546 + }, + { + "epoch": 1.4170984455958548, + "grad_norm": 56.14718024907725, + "learning_rate": 8.693106520505147e-06, + "loss": 1.0185, + "step": 547 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 53.3812598790062, + "learning_rate": 8.622408748325461e-06, + "loss": 1.0859, + "step": 548 + }, + { + "epoch": 1.422279792746114, + "grad_norm": 39.69041631433326, + "learning_rate": 8.551920545243704e-06, + "loss": 1.1146, + "step": 549 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 24.099260758984773, + "learning_rate": 8.481643209614576e-06, + "loss": 1.0968, + "step": 550 + }, + { + "epoch": 1.427461139896373, + "grad_norm": 22.623850373369237, + "learning_rate": 8.411578035908728e-06, + "loss": 1.0642, + "step": 551 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 25.343746374404027, + "learning_rate": 8.341726314688875e-06, + "loss": 1.0815, + "step": 552 + }, + { + "epoch": 1.432642487046632, + "grad_norm": 35.82641011588973, + "learning_rate": 8.272089332586089e-06, + "loss": 1.1012, + "step": 553 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 24.81161215784662, + "learning_rate": 8.20266837227603e-06, + "loss": 1.1086, + "step": 554 + }, + { + "epoch": 1.4378238341968912, + "grad_norm": 54.18243481591251, + "learning_rate": 
8.133464712455364e-06, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 23.602598217141395, + "learning_rate": 8.064479627818213e-06, + "loss": 1.1519, + "step": 556 + }, + { + "epoch": 1.4430051813471503, + "grad_norm": 31.124404868409982, + "learning_rate": 7.995714389032638e-06, + "loss": 1.0705, + "step": 557 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 24.14171016995626, + "learning_rate": 7.927170262717284e-06, + "loss": 1.1083, + "step": 558 + }, + { + "epoch": 1.4481865284974094, + "grad_norm": 47.987203109917175, + "learning_rate": 7.858848511417998e-06, + "loss": 1.0836, + "step": 559 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 25.871447098066056, + "learning_rate": 7.790750393584616e-06, + "loss": 1.0787, + "step": 560 + }, + { + "epoch": 1.4533678756476685, + "grad_norm": 23.820249113937482, + "learning_rate": 7.72287716354776e-06, + "loss": 1.1165, + "step": 561 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 48.04131308947624, + "learning_rate": 7.65523007149575e-06, + "loss": 1.0819, + "step": 562 + }, + { + "epoch": 1.4585492227979275, + "grad_norm": 29.273494083692352, + "learning_rate": 7.587810363451544e-06, + "loss": 1.0302, + "step": 563 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 120.01571222366722, + "learning_rate": 7.5206192812498345e-06, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 1.4637305699481864, + "grad_norm": 33.16947662083338, + "learning_rate": 7.4536580625141244e-06, + "loss": 1.0842, + "step": 565 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 29.979556378166713, + "learning_rate": 7.386927940633981e-06, + "loss": 1.1116, + "step": 566 + }, + { + "epoch": 1.4689119170984455, + "grad_norm": 27.172344859281896, + "learning_rate": 7.32043014474227e-06, + "loss": 1.0676, + "step": 567 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 30.208548637757318, + "learning_rate": 7.254165899692554e-06, + "loss": 1.1104, + "step": 568 + }, + { + 
"epoch": 1.4740932642487046, + "grad_norm": 19.385421184583773, + "learning_rate": 7.188136426036498e-06, + "loss": 1.0085, + "step": 569 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 30.350787749309685, + "learning_rate": 7.12234294000143e-06, + "loss": 1.0584, + "step": 570 + }, + { + "epoch": 1.4792746113989637, + "grad_norm": 31.520305600900198, + "learning_rate": 7.056786653467882e-06, + "loss": 1.0831, + "step": 571 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 46.13006972574487, + "learning_rate": 6.991468773947321e-06, + "loss": 1.1761, + "step": 572 + }, + { + "epoch": 1.4844559585492227, + "grad_norm": 26.72340868362835, + "learning_rate": 6.926390504559879e-06, + "loss": 1.0605, + "step": 573 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 25.992965411102556, + "learning_rate": 6.861553044012206e-06, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 1.4896373056994818, + "grad_norm": 38.60187420279626, + "learning_rate": 6.796957586575364e-06, + "loss": 1.1232, + "step": 575 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 21.7618591565717, + "learning_rate": 6.732605322062869e-06, + "loss": 1.1196, + "step": 576 + }, + { + "epoch": 1.494818652849741, + "grad_norm": 28.233093007170996, + "learning_rate": 6.668497435808736e-06, + "loss": 1.1451, + "step": 577 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 28.061514297823816, + "learning_rate": 6.604635108645683e-06, + "loss": 1.0832, + "step": 578 + }, + { + "epoch": 1.5, + "grad_norm": 35.34503147975386, + "learning_rate": 6.5410195168833425e-06, + "loss": 1.118, + "step": 579 + } + ], + "logging_steps": 1, + "max_steps": 772, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 193, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 
1.0022991232499712e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-579/training_args.bin b/checkpoint-579/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4d661b15e5bbd8390fd11a502bea76680041301 --- /dev/null +++ b/checkpoint-579/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe76c44cf1ade69372a2b861f80cfcfc5ba88f283683f660a4a0605f642aee3 +size 8568 diff --git a/checkpoint-579/zero_to_fp32.py b/checkpoint-579/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-579/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-618/README.md b/checkpoint-618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-618/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-618/adapter_config.json b/checkpoint-618/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/checkpoint-618/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-618/adapter_model.safetensors b/checkpoint-618/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33fd135a729971587398eb81df85b84a291ab4dc --- /dev/null +++ b/checkpoint-618/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ac6338f4da6e13c15823f24e19a271d8fc65df47f503bcfc14c8766e14ef0bc +size 5579575888 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a191af173b723477e42b0a79af6ddab2af6fb19c --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2313d9207c50abb87475adf598a5d6fd3d3d02d22a7b8852855f6dd1abd67977 +size 2458601314 
diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e26be5b394465290ffffb20cfd48342f4a3b9c19 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbeb45abbdd046dc63cd170c5bde2c73d64227159812131ab33ec194a12b170a +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..843d35b523ca76fd1d85413c4ce50465be4372b1 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b84e269a78f25aad5152fd4bdef3ec4e6635ed14d6d08702d3f5e125a14ac28c +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87b8ad101a0fcd3fa5b32fd26360252d3edc77f4 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215071f0888a851073f0f40280a8aca3d352c7cc71c16f9c407988c78fcfa8f7 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5d4a4d1e0e0887a6a7d7f801c73150fec2793ec --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a73d035178aead156f96b9f208248db39158203b83ef9aed34d8ec3c6b174236 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c614ae3828df91d62a62c1c93450bb83342b11c1 --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee07b54f69db0b9e3953e949e178cc87592e3eaa1a845556be656befaaed324 +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37a860fddeaffb071a6f42a5bf2de6cefd1a328c --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de7012e2b66a6f72edf266b4e20132b482c19058bb357065fb8aa7f0314c069b +size 2458601314 diff --git a/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2af90a6ee085d81fe0119d7767a2e116eca3205f --- /dev/null +++ b/checkpoint-618/global_step617/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a9d463c8c32fe26cb6ceb06f636a283e30cc9d24c48080d6ef467ea2ba506d +size 2458601314 diff --git a/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a94c0321b519c597a0c94bc9cb4e9370ca2b765e --- 
/dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258423171c32e9d6cc74b23612ceb2e89e38df0d91d795457185e713a7644523 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae518370a60f71674df5904752ba108a45154fe4 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9856c3fab0d45b7a84d519c6a5781e65fe4d56e33855490b163619887fb91be +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ad81e3065870e7d7ac7488f5289b5165736ba9c --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2580444416d0f9c1ba57df79e9e685673596d1ad95a07002eb4da35198eb4c0d +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57d3de63b3c7899b8abb561a3992fe65ecb19ca0 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69b821888fc1affcbcc9b822d45a27588b805eb45eceefe64b5afea53beac0b +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..e44cf635aa0308f463f0e4f646e20bb29030acdf --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ba8e1d5086335ef4cd1bb701d69c884685bc094916d51534e55a2a9aa06693 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..850a94913244cb47e8ac73cb6a237a2578c77831 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe34e372dedacdb33571ab2b4347da88102b5a7930be9126607a44d798627a52 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01209e2f8c1a36487754beef6fb0ba7ea473a84 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c432819caff1085cbf70acadac49f8169a1ab6bbb00b8b4c71c97577c34f02 +size 752148 diff --git a/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c6cb3c270e8abba3cc7a3cd5b42413f4aaa99b5 --- /dev/null +++ b/checkpoint-618/global_step617/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ca8f6fbaf4df3d5f83c349c9b21bc45b5976a02529ffea751083000bde298e +size 752148 diff --git a/checkpoint-618/latest b/checkpoint-618/latest new file mode 100644 index 
0000000000000000000000000000000000000000..e3591976dc02bcf876afbbae81c6b15992f0b7cc --- /dev/null +++ b/checkpoint-618/latest @@ -0,0 +1 @@ +global_step617 \ No newline at end of file diff --git a/checkpoint-618/rng_state_0.pth b/checkpoint-618/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..21af2edb36d5dc2f0f272356f08666b8ba46404d --- /dev/null +++ b/checkpoint-618/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139ab52a8b7888bb2b6bc57022d15e95bde7158a58f3b96c075a46432bd804f7 +size 15984 diff --git a/checkpoint-618/rng_state_1.pth b/checkpoint-618/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa67ef3f64f0b3e19384d6312b6150bf6c01d9c7 --- /dev/null +++ b/checkpoint-618/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ac300c23c0f0222dfbcfb4e7bf191c1b9c07f7e759e1445653318a00154087 +size 15984 diff --git a/checkpoint-618/rng_state_2.pth b/checkpoint-618/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e61d76ca61b2704bb56bc0c9216ddac760d3cfd --- /dev/null +++ b/checkpoint-618/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0b8aef61e966c433db5c300020f4cc5f72210c72167d63a56502cf32efde2e +size 15984 diff --git a/checkpoint-618/rng_state_3.pth b/checkpoint-618/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8822c9d82969c59d7e0e39285330ac97a4afbd62 --- /dev/null +++ b/checkpoint-618/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9b368f180513edf8a6b276c52fad1cee0a5669655c79fd700779055266cbdc +size 15984 diff --git a/checkpoint-618/rng_state_4.pth b/checkpoint-618/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9999feb8ddfc2a1800d467e6ba26eee2216bbf50 --- /dev/null +++ b/checkpoint-618/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7a7949cc8c56130fc5744bf28cb582eadff20beb2a6046521867b30d17e2db36 +size 15984 diff --git a/checkpoint-618/rng_state_5.pth b/checkpoint-618/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..31e754380a3988fb4e0a4cc3f3dc27ebea4e2534 --- /dev/null +++ b/checkpoint-618/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e91cc570010bc761d4607a41efb25dcbbe25b17376ef2b2f9f56979e35df8a6 +size 15984 diff --git a/checkpoint-618/rng_state_6.pth b/checkpoint-618/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a2c4060c498d2f9af4a1b7515a9626929ff400f --- /dev/null +++ b/checkpoint-618/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d62293b3776c3666306e46e8e4089019cc3a093559b478e0523e85fcf1f00c09 +size 15984 diff --git a/checkpoint-618/rng_state_7.pth b/checkpoint-618/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..c7572f4efdf870301e9beaedf8f8837610435241 --- /dev/null +++ b/checkpoint-618/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d5c6cfa7ad06775825ebbc35e15d8b73c7ef8a7546a855de02ef444f52ab9d +size 15984 diff --git a/checkpoint-618/scheduler.pt b/checkpoint-618/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d3f0cae23c97a8fa62dba8839bd6d30fab58e6 --- /dev/null +++ b/checkpoint-618/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9c98ae9f0c0fe9c37ce7e2650666ea461d75f496b952bbbb48a98af405c315 +size 1064 diff --git a/checkpoint-618/special_tokens_map.json b/checkpoint-618/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-618/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-618/tokenizer.json b/checkpoint-618/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-618/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-618/tokenizer_config.json b/checkpoint-618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-618/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-618/trainer_state.json b/checkpoint-618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3e3e7b728d3492f804c2962a06bcb51c0c8c38 --- /dev/null +++ b/checkpoint-618/trainer_state.json @@ -0,0 +1,4416 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5006075334143378, + "eval_steps": 103, + "global_step": 618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002430133657351154, + "grad_norm": 715.4923219036787, + "learning_rate": 0.0, + "loss": 1.3541, + "step": 1 + }, + { + "epoch": 0.002430133657351154, + "eval_loss": 1.3335719108581543, + "eval_runtime": 53.4883, + "eval_samples_per_second": 13.91, + "eval_steps_per_second": 1.739, + "step": 1 + }, + { + "epoch": 0.004860267314702308, + "grad_norm": 614.6970578314867, + "learning_rate": 5e-06, + "loss": 1.3775, + "step": 2 + }, + { + "epoch": 0.007290400972053463, + "grad_norm": 471.59017991123795, + "learning_rate": 1e-05, + "loss": 1.339, + 
"step": 3 + }, + { + "epoch": 0.009720534629404616, + "grad_norm": 238.72216262259653, + "learning_rate": 1.5e-05, + "loss": 1.3829, + "step": 4 + }, + { + "epoch": 0.012150668286755772, + "grad_norm": 355.68955726709873, + "learning_rate": 2e-05, + "loss": 1.3597, + "step": 5 + }, + { + "epoch": 0.014580801944106925, + "grad_norm": 414.5627284272111, + "learning_rate": 2.5e-05, + "loss": 1.3862, + "step": 6 + }, + { + "epoch": 0.01701093560145808, + "grad_norm": 534.9877222052693, + "learning_rate": 3e-05, + "loss": 1.2784, + "step": 7 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 153.38895635666677, + "learning_rate": 3.5e-05, + "loss": 1.3521, + "step": 8 + }, + { + "epoch": 0.02187120291616039, + "grad_norm": 858.293734138087, + "learning_rate": 4e-05, + "loss": 1.2461, + "step": 9 + }, + { + "epoch": 0.024301336573511544, + "grad_norm": 255.81989388533376, + "learning_rate": 4.5e-05, + "loss": 1.2778, + "step": 10 + }, + { + "epoch": 0.026731470230862697, + "grad_norm": 368.91949003479226, + "learning_rate": 5e-05, + "loss": 1.3412, + "step": 11 + }, + { + "epoch": 0.02916160388821385, + "grad_norm": 176.49481799555898, + "learning_rate": 5.500000000000001e-05, + "loss": 1.3437, + "step": 12 + }, + { + "epoch": 0.031591737545565005, + "grad_norm": 208.57742104974147, + "learning_rate": 6e-05, + "loss": 1.2859, + "step": 13 + }, + { + "epoch": 0.03402187120291616, + "grad_norm": 93.26742036471734, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1843, + "step": 14 + }, + { + "epoch": 0.03645200486026731, + "grad_norm": 145.53380444622215, + "learning_rate": 7e-05, + "loss": 1.4281, + "step": 15 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 126.56724937430516, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3908, + "step": 16 + }, + { + "epoch": 0.041312272174969626, + "grad_norm": 106.19246390662754, + "learning_rate": 8e-05, + "loss": 1.344, + "step": 17 + }, + { + "epoch": 0.04374240583232078, + "grad_norm": 289.348178084847, 
+ "learning_rate": 8.5e-05, + "loss": 1.2708, + "step": 18 + }, + { + "epoch": 0.046172539489671933, + "grad_norm": 286.63676887065634, + "learning_rate": 9e-05, + "loss": 1.3564, + "step": 19 + }, + { + "epoch": 0.04860267314702309, + "grad_norm": 269.6096299101413, + "learning_rate": 9.5e-05, + "loss": 1.2184, + "step": 20 + }, + { + "epoch": 0.05103280680437424, + "grad_norm": 151.28678796160915, + "learning_rate": 0.0001, + "loss": 1.2974, + "step": 21 + }, + { + "epoch": 0.053462940461725394, + "grad_norm": 265.5625538646362, + "learning_rate": 0.000105, + "loss": 1.2703, + "step": 22 + }, + { + "epoch": 0.05589307411907655, + "grad_norm": 724.7157187586193, + "learning_rate": 0.00011000000000000002, + "loss": 1.2691, + "step": 23 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 425.3768239347252, + "learning_rate": 0.00011499999999999999, + "loss": 1.375, + "step": 24 + }, + { + "epoch": 0.060753341433778855, + "grad_norm": 314.5119318308783, + "learning_rate": 0.00012, + "loss": 1.2952, + "step": 25 + }, + { + "epoch": 0.06318347509113001, + "grad_norm": 557.519173033834, + "learning_rate": 0.000125, + "loss": 1.2923, + "step": 26 + }, + { + "epoch": 0.06561360874848117, + "grad_norm": 211.4069356529637, + "learning_rate": 0.00013000000000000002, + "loss": 1.2629, + "step": 27 + }, + { + "epoch": 0.06804374240583232, + "grad_norm": 299.7742653722713, + "learning_rate": 0.00013500000000000003, + "loss": 1.3099, + "step": 28 + }, + { + "epoch": 0.07047387606318348, + "grad_norm": 182.18551965886013, + "learning_rate": 0.00014, + "loss": 1.2215, + "step": 29 + }, + { + "epoch": 0.07290400972053462, + "grad_norm": 153.38300520125887, + "learning_rate": 0.000145, + "loss": 1.2799, + "step": 30 + }, + { + "epoch": 0.07533414337788578, + "grad_norm": 849.4472853252786, + "learning_rate": 0.00015000000000000001, + "loss": 1.2012, + "step": 31 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 179.94814586965418, + "learning_rate": 0.000155, + "loss": 
1.2103, + "step": 32 + }, + { + "epoch": 0.08019441069258809, + "grad_norm": 180.36681057956048, + "learning_rate": 0.00016, + "loss": 1.2414, + "step": 33 + }, + { + "epoch": 0.08262454434993925, + "grad_norm": 113.72852454032189, + "learning_rate": 0.000165, + "loss": 1.2508, + "step": 34 + }, + { + "epoch": 0.0850546780072904, + "grad_norm": 150.53415363213057, + "learning_rate": 0.00017, + "loss": 1.2528, + "step": 35 + }, + { + "epoch": 0.08748481166464156, + "grad_norm": 156.19567878683574, + "learning_rate": 0.000175, + "loss": 1.2016, + "step": 36 + }, + { + "epoch": 0.0899149453219927, + "grad_norm": 416.34884765145057, + "learning_rate": 0.00018, + "loss": 1.254, + "step": 37 + }, + { + "epoch": 0.09234507897934387, + "grad_norm": 269.7105025581372, + "learning_rate": 0.00018500000000000002, + "loss": 1.2215, + "step": 38 + }, + { + "epoch": 0.09477521263669501, + "grad_norm": 249.35069047655023, + "learning_rate": 0.00019, + "loss": 1.2078, + "step": 39 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 167.16896045613478, + "learning_rate": 0.000195, + "loss": 1.1866, + "step": 40 + }, + { + "epoch": 0.09963547995139732, + "grad_norm": 248.22240554128427, + "learning_rate": 0.0002, + "loss": 1.252, + "step": 41 + }, + { + "epoch": 0.10206561360874848, + "grad_norm": 180.89520841022969, + "learning_rate": 0.0001999991930332148, + "loss": 1.2251, + "step": 42 + }, + { + "epoch": 0.10449574726609964, + "grad_norm": 614.4291375430485, + "learning_rate": 0.00019999677214588312, + "loss": 1.2563, + "step": 43 + }, + { + "epoch": 0.10692588092345079, + "grad_norm": 211.7523427355369, + "learning_rate": 0.00019999273737707646, + "loss": 1.193, + "step": 44 + }, + { + "epoch": 0.10935601458080195, + "grad_norm": 181.56788458769344, + "learning_rate": 0.00019998708879191335, + "loss": 1.2598, + "step": 45 + }, + { + "epoch": 0.1117861482381531, + "grad_norm": 157.5783414916277, + "learning_rate": 0.00019997982648155814, + "loss": 1.2663, + "step": 46 + }, + 
{ + "epoch": 0.11421628189550426, + "grad_norm": 155.78006251192625, + "learning_rate": 0.00019997095056321971, + "loss": 1.1637, + "step": 47 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 202.0253360488958, + "learning_rate": 0.00019996046118014955, + "loss": 1.2508, + "step": 48 + }, + { + "epoch": 0.11907654921020656, + "grad_norm": 192.7576297264874, + "learning_rate": 0.00019994835850163924, + "loss": 1.2014, + "step": 49 + }, + { + "epoch": 0.12150668286755771, + "grad_norm": 132.5484871621418, + "learning_rate": 0.00019993464272301804, + "loss": 1.2279, + "step": 50 + }, + { + "epoch": 0.12393681652490887, + "grad_norm": 128.32285438248965, + "learning_rate": 0.00019991931406564944, + "loss": 1.2179, + "step": 51 + }, + { + "epoch": 0.12636695018226002, + "grad_norm": 552.3669463716512, + "learning_rate": 0.00019990237277692788, + "loss": 1.1498, + "step": 52 + }, + { + "epoch": 0.12879708383961117, + "grad_norm": 86.17911790260192, + "learning_rate": 0.00019988381913027442, + "loss": 1.2784, + "step": 53 + }, + { + "epoch": 0.13122721749696234, + "grad_norm": 70.83294605515782, + "learning_rate": 0.00019986365342513265, + "loss": 1.2224, + "step": 54 + }, + { + "epoch": 0.1336573511543135, + "grad_norm": 45.23624563299466, + "learning_rate": 0.00019984187598696363, + "loss": 1.1746, + "step": 55 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 57.67645735585192, + "learning_rate": 0.00019981848716724073, + "loss": 1.2154, + "step": 56 + }, + { + "epoch": 0.1385176184690158, + "grad_norm": 45.661268047129674, + "learning_rate": 0.00019979348734344398, + "loss": 1.1411, + "step": 57 + }, + { + "epoch": 0.14094775212636695, + "grad_norm": 53.10628399970359, + "learning_rate": 0.00019976687691905393, + "loss": 1.2029, + "step": 58 + }, + { + "epoch": 0.1433778857837181, + "grad_norm": 38.71353325803162, + "learning_rate": 0.00019973865632354516, + "loss": 1.1976, + "step": 59 + }, + { + "epoch": 0.14580801944106925, + "grad_norm": 
42.789208063581114, + "learning_rate": 0.0001997088260123793, + "loss": 1.1477, + "step": 60 + }, + { + "epoch": 0.14823815309842042, + "grad_norm": 37.613194740192164, + "learning_rate": 0.0001996773864669978, + "loss": 1.2529, + "step": 61 + }, + { + "epoch": 0.15066828675577157, + "grad_norm": 47.96813084127655, + "learning_rate": 0.00019964433819481405, + "loss": 1.2328, + "step": 62 + }, + { + "epoch": 0.15309842041312272, + "grad_norm": 55.30483872428545, + "learning_rate": 0.00019960968172920516, + "loss": 1.1996, + "step": 63 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 35.58995799070749, + "learning_rate": 0.00019957341762950344, + "loss": 1.1248, + "step": 64 + }, + { + "epoch": 0.15795868772782504, + "grad_norm": 58.86131222300149, + "learning_rate": 0.00019953554648098748, + "loss": 1.3017, + "step": 65 + }, + { + "epoch": 0.16038882138517618, + "grad_norm": 32.12091331878439, + "learning_rate": 0.00019949606889487233, + "loss": 1.1961, + "step": 66 + }, + { + "epoch": 0.16281895504252733, + "grad_norm": 167.27433996357928, + "learning_rate": 0.0001994549855083001, + "loss": 1.1768, + "step": 67 + }, + { + "epoch": 0.1652490886998785, + "grad_norm": 32.3328494297432, + "learning_rate": 0.0001994122969843293, + "loss": 1.1802, + "step": 68 + }, + { + "epoch": 0.16767922235722965, + "grad_norm": 39.92530074438497, + "learning_rate": 0.0001993680040119244, + "loss": 1.2098, + "step": 69 + }, + { + "epoch": 0.1701093560145808, + "grad_norm": 45.60830517129956, + "learning_rate": 0.0001993221073059445, + "loss": 1.2159, + "step": 70 + }, + { + "epoch": 0.17253948967193194, + "grad_norm": 35.462695032736335, + "learning_rate": 0.00019927460760713197, + "loss": 1.1818, + "step": 71 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 43.05751624597826, + "learning_rate": 0.0001992255056821004, + "loss": 1.2011, + "step": 72 + }, + { + "epoch": 0.17739975698663427, + "grad_norm": 47.13143404969894, + "learning_rate": 0.00019917480232332224, + 
"loss": 1.1669, + "step": 73 + }, + { + "epoch": 0.1798298906439854, + "grad_norm": 72.07146401418987, + "learning_rate": 0.000199122498349116, + "loss": 1.181, + "step": 74 + }, + { + "epoch": 0.1822600243013366, + "grad_norm": 36.289202348834955, + "learning_rate": 0.00019906859460363307, + "loss": 1.1787, + "step": 75 + }, + { + "epoch": 0.18469015795868773, + "grad_norm": 46.92636167228936, + "learning_rate": 0.00019901309195684416, + "loss": 1.2316, + "step": 76 + }, + { + "epoch": 0.18712029161603888, + "grad_norm": 31.71425340357504, + "learning_rate": 0.00019895599130452505, + "loss": 1.1607, + "step": 77 + }, + { + "epoch": 0.18955042527339003, + "grad_norm": 43.94199928621344, + "learning_rate": 0.00019889729356824235, + "loss": 1.1919, + "step": 78 + }, + { + "epoch": 0.1919805589307412, + "grad_norm": 45.33073791860179, + "learning_rate": 0.0001988369996953386, + "loss": 1.2237, + "step": 79 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 135.89980489661897, + "learning_rate": 0.00019877511065891673, + "loss": 1.1822, + "step": 80 + }, + { + "epoch": 0.1968408262454435, + "grad_norm": 439.6770852212966, + "learning_rate": 0.00019871162745782478, + "loss": 1.1441, + "step": 81 + }, + { + "epoch": 0.19927095990279464, + "grad_norm": 80.73319798776026, + "learning_rate": 0.0001986465511166394, + "loss": 1.1709, + "step": 82 + }, + { + "epoch": 0.20170109356014582, + "grad_norm": 87.76515297497458, + "learning_rate": 0.00019857988268564953, + "loss": 1.1549, + "step": 83 + }, + { + "epoch": 0.20413122721749696, + "grad_norm": 70.08754986406095, + "learning_rate": 0.00019851162324083932, + "loss": 1.1771, + "step": 84 + }, + { + "epoch": 0.2065613608748481, + "grad_norm": 187.8198997057664, + "learning_rate": 0.0001984417738838709, + "loss": 1.2068, + "step": 85 + }, + { + "epoch": 0.20899149453219928, + "grad_norm": 127.78818684755072, + "learning_rate": 0.00019837033574206646, + "loss": 1.1974, + "step": 86 + }, + { + "epoch": 0.21142162818955043, 
+ "grad_norm": 127.82979216871074, + "learning_rate": 0.0001982973099683902, + "loss": 1.185, + "step": 87 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 142.35425084857746, + "learning_rate": 0.00019822269774142954, + "loss": 1.2225, + "step": 88 + }, + { + "epoch": 0.21628189550425272, + "grad_norm": 246.64019353564817, + "learning_rate": 0.0001981465002653763, + "loss": 1.2574, + "step": 89 + }, + { + "epoch": 0.2187120291616039, + "grad_norm": 189.88471076285524, + "learning_rate": 0.0001980687187700071, + "loss": 1.1635, + "step": 90 + }, + { + "epoch": 0.22114216281895505, + "grad_norm": 116.65693373141701, + "learning_rate": 0.00019798935451066361, + "loss": 1.1457, + "step": 91 + }, + { + "epoch": 0.2235722964763062, + "grad_norm": 71.76422539970217, + "learning_rate": 0.00019790840876823232, + "loss": 1.2354, + "step": 92 + }, + { + "epoch": 0.22600243013365734, + "grad_norm": 139.42330509386431, + "learning_rate": 0.0001978258828491236, + "loss": 1.18, + "step": 93 + }, + { + "epoch": 0.2284325637910085, + "grad_norm": 131.88308820601443, + "learning_rate": 0.00019774177808525113, + "loss": 1.1868, + "step": 94 + }, + { + "epoch": 0.23086269744835966, + "grad_norm": 85.81071125615291, + "learning_rate": 0.00019765609583400977, + "loss": 1.1814, + "step": 95 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 84.43756298541064, + "learning_rate": 0.00019756883747825424, + "loss": 1.1658, + "step": 96 + }, + { + "epoch": 0.23572296476306198, + "grad_norm": 114.24245545143974, + "learning_rate": 0.0001974800044262764, + "loss": 1.2497, + "step": 97 + }, + { + "epoch": 0.23815309842041313, + "grad_norm": 76.577511222722, + "learning_rate": 0.00019738959811178272, + "loss": 1.1414, + "step": 98 + }, + { + "epoch": 0.24058323207776428, + "grad_norm": 171.8084830895381, + "learning_rate": 0.00019729761999387103, + "loss": 1.1619, + "step": 99 + }, + { + "epoch": 0.24301336573511542, + "grad_norm": 221.87752250936416, + "learning_rate": 
0.00019720407155700707, + "loss": 1.2718, + "step": 100 + }, + { + "epoch": 0.2454434993924666, + "grad_norm": 205.64943975370608, + "learning_rate": 0.00019710895431100046, + "loss": 1.1786, + "step": 101 + }, + { + "epoch": 0.24787363304981774, + "grad_norm": 160.16582903260615, + "learning_rate": 0.00019701226979098037, + "loss": 1.1426, + "step": 102 + }, + { + "epoch": 0.2503037667071689, + "grad_norm": 82.85031394537334, + "learning_rate": 0.00019691401955737072, + "loss": 1.1718, + "step": 103 + }, + { + "epoch": 0.2503037667071689, + "eval_loss": 1.1633374691009521, + "eval_runtime": 52.6182, + "eval_samples_per_second": 14.14, + "eval_steps_per_second": 1.767, + "step": 103 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 94.74469296109082, + "learning_rate": 0.000196814205195865, + "loss": 1.2255, + "step": 104 + }, + { + "epoch": 0.2551640340218712, + "grad_norm": 126.15797466756656, + "learning_rate": 0.00019671282831740076, + "loss": 1.1623, + "step": 105 + }, + { + "epoch": 0.25759416767922233, + "grad_norm": 79.41156434272008, + "learning_rate": 0.0001966098905581334, + "loss": 1.1606, + "step": 106 + }, + { + "epoch": 0.2600243013365735, + "grad_norm": 70.33104031058372, + "learning_rate": 0.00019650539357941003, + "loss": 1.196, + "step": 107 + }, + { + "epoch": 0.2624544349939247, + "grad_norm": 69.57260733822498, + "learning_rate": 0.0001963993390677424, + "loss": 1.1939, + "step": 108 + }, + { + "epoch": 0.2648845686512758, + "grad_norm": 81.78820691772725, + "learning_rate": 0.00019629172873477995, + "loss": 1.2553, + "step": 109 + }, + { + "epoch": 0.267314702308627, + "grad_norm": 117.06324110268656, + "learning_rate": 0.00019618256431728194, + "loss": 1.2535, + "step": 110 + }, + { + "epoch": 0.26974483596597815, + "grad_norm": 83.26993317104247, + "learning_rate": 0.00019607184757708951, + "loss": 1.157, + "step": 111 + }, + { + "epoch": 0.27217496962332927, + "grad_norm": 51.990829456422375, + "learning_rate": 
0.00019595958030109735, + "loss": 1.1274, + "step": 112 + }, + { + "epoch": 0.27460510328068044, + "grad_norm": 119.7487160875729, + "learning_rate": 0.00019584576430122473, + "loss": 1.1422, + "step": 113 + }, + { + "epoch": 0.2770352369380316, + "grad_norm": 88.15636932272304, + "learning_rate": 0.00019573040141438624, + "loss": 1.1599, + "step": 114 + }, + { + "epoch": 0.27946537059538273, + "grad_norm": 62.346402225534774, + "learning_rate": 0.00019561349350246226, + "loss": 1.1909, + "step": 115 + }, + { + "epoch": 0.2818955042527339, + "grad_norm": 76.40612150653034, + "learning_rate": 0.0001954950424522688, + "loss": 1.1646, + "step": 116 + }, + { + "epoch": 0.284325637910085, + "grad_norm": 94.8711613055073, + "learning_rate": 0.00019537505017552716, + "loss": 1.1547, + "step": 117 + }, + { + "epoch": 0.2867557715674362, + "grad_norm": 63.86961661796314, + "learning_rate": 0.00019525351860883293, + "loss": 1.1841, + "step": 118 + }, + { + "epoch": 0.2891859052247874, + "grad_norm": 133.2417924150684, + "learning_rate": 0.00019513044971362494, + "loss": 1.1365, + "step": 119 + }, + { + "epoch": 0.2916160388821385, + "grad_norm": 133.44891510996445, + "learning_rate": 0.00019500584547615333, + "loss": 1.1696, + "step": 120 + }, + { + "epoch": 0.29404617253948967, + "grad_norm": 58.51701768739601, + "learning_rate": 0.00019487970790744774, + "loss": 1.1874, + "step": 121 + }, + { + "epoch": 0.29647630619684084, + "grad_norm": 49.536158238056196, + "learning_rate": 0.00019475203904328474, + "loss": 1.1798, + "step": 122 + }, + { + "epoch": 0.29890643985419196, + "grad_norm": 94.27608706983857, + "learning_rate": 0.000194622840944155, + "loss": 1.2443, + "step": 123 + }, + { + "epoch": 0.30133657351154314, + "grad_norm": 103.868243202843, + "learning_rate": 0.00019449211569523, + "loss": 1.1759, + "step": 124 + }, + { + "epoch": 0.3037667071688943, + "grad_norm": 73.31536435980003, + "learning_rate": 0.00019435986540632843, + "loss": 1.1885, + "step": 125 + }, + 
{ + "epoch": 0.30619684082624543, + "grad_norm": 64.91149114745738, + "learning_rate": 0.00019422609221188207, + "loss": 1.1864, + "step": 126 + }, + { + "epoch": 0.3086269744835966, + "grad_norm": 95.34449184763653, + "learning_rate": 0.00019409079827090145, + "loss": 1.1339, + "step": 127 + }, + { + "epoch": 0.3110571081409477, + "grad_norm": 67.36156159754226, + "learning_rate": 0.00019395398576694086, + "loss": 1.1845, + "step": 128 + }, + { + "epoch": 0.3134872417982989, + "grad_norm": 36.94913176821407, + "learning_rate": 0.00019381565690806328, + "loss": 1.2154, + "step": 129 + }, + { + "epoch": 0.3159173754556501, + "grad_norm": 69.05265214547647, + "learning_rate": 0.00019367581392680457, + "loss": 1.1642, + "step": 130 + }, + { + "epoch": 0.3183475091130012, + "grad_norm": 38.974761165559855, + "learning_rate": 0.00019353445908013755, + "loss": 1.1508, + "step": 131 + }, + { + "epoch": 0.32077764277035237, + "grad_norm": 48.47215142199794, + "learning_rate": 0.00019339159464943557, + "loss": 1.2011, + "step": 132 + }, + { + "epoch": 0.32320777642770354, + "grad_norm": 41.88512063342574, + "learning_rate": 0.00019324722294043558, + "loss": 1.1643, + "step": 133 + }, + { + "epoch": 0.32563791008505466, + "grad_norm": 25.59403215229145, + "learning_rate": 0.00019310134628320114, + "loss": 1.1954, + "step": 134 + }, + { + "epoch": 0.32806804374240583, + "grad_norm": 58.02634988046396, + "learning_rate": 0.00019295396703208453, + "loss": 1.1544, + "step": 135 + }, + { + "epoch": 0.330498177399757, + "grad_norm": 31.26218977398251, + "learning_rate": 0.00019280508756568896, + "loss": 1.1613, + "step": 136 + }, + { + "epoch": 0.33292831105710813, + "grad_norm": 31.81234539284103, + "learning_rate": 0.00019265471028683014, + "loss": 1.1892, + "step": 137 + }, + { + "epoch": 0.3353584447144593, + "grad_norm": 54.44930114675527, + "learning_rate": 0.00019250283762249748, + "loss": 1.2801, + "step": 138 + }, + { + "epoch": 0.3377885783718105, + "grad_norm": 
30.320486287732734, + "learning_rate": 0.00019234947202381486, + "loss": 1.1934, + "step": 139 + }, + { + "epoch": 0.3402187120291616, + "grad_norm": 32.76175001943503, + "learning_rate": 0.00019219461596600113, + "loss": 1.1436, + "step": 140 + }, + { + "epoch": 0.34264884568651277, + "grad_norm": 36.802264122697316, + "learning_rate": 0.00019203827194833026, + "loss": 1.1418, + "step": 141 + }, + { + "epoch": 0.3450789793438639, + "grad_norm": 35.03898729580271, + "learning_rate": 0.0001918804424940908, + "loss": 1.2479, + "step": 142 + }, + { + "epoch": 0.34750911300121506, + "grad_norm": 89.58068030461165, + "learning_rate": 0.00019172113015054532, + "loss": 1.2504, + "step": 143 + }, + { + "epoch": 0.34993924665856624, + "grad_norm": 30.05799668441019, + "learning_rate": 0.00019156033748888917, + "loss": 1.1662, + "step": 144 + }, + { + "epoch": 0.35236938031591736, + "grad_norm": 33.80121199203598, + "learning_rate": 0.00019139806710420914, + "loss": 1.1862, + "step": 145 + }, + { + "epoch": 0.35479951397326853, + "grad_norm": 31.510896023067872, + "learning_rate": 0.00019123432161544142, + "loss": 1.147, + "step": 146 + }, + { + "epoch": 0.3572296476306197, + "grad_norm": 32.92613286618093, + "learning_rate": 0.00019106910366532942, + "loss": 1.1421, + "step": 147 + }, + { + "epoch": 0.3596597812879708, + "grad_norm": 245.36013493823395, + "learning_rate": 0.00019090241592038113, + "loss": 1.1306, + "step": 148 + }, + { + "epoch": 0.362089914945322, + "grad_norm": 72.3061625644275, + "learning_rate": 0.000190734261070826, + "loss": 1.1144, + "step": 149 + }, + { + "epoch": 0.3645200486026732, + "grad_norm": 63.77748866336388, + "learning_rate": 0.00019056464183057157, + "loss": 1.1249, + "step": 150 + }, + { + "epoch": 0.3669501822600243, + "grad_norm": 633.2421324308109, + "learning_rate": 0.00019039356093715975, + "loss": 1.1359, + "step": 151 + }, + { + "epoch": 0.36938031591737547, + "grad_norm": 34.456657555313704, + "learning_rate": 
0.00019022102115172248, + "loss": 1.1397, + "step": 152 + }, + { + "epoch": 0.3718104495747266, + "grad_norm": 35.21328820959324, + "learning_rate": 0.00019004702525893732, + "loss": 1.1741, + "step": 153 + }, + { + "epoch": 0.37424058323207776, + "grad_norm": 90.32405227187036, + "learning_rate": 0.00018987157606698235, + "loss": 1.1844, + "step": 154 + }, + { + "epoch": 0.37667071688942894, + "grad_norm": 39.348755664527914, + "learning_rate": 0.000189694676407491, + "loss": 1.1216, + "step": 155 + }, + { + "epoch": 0.37910085054678005, + "grad_norm": 58.85540744859834, + "learning_rate": 0.00018951632913550626, + "loss": 1.115, + "step": 156 + }, + { + "epoch": 0.38153098420413123, + "grad_norm": 39.849945227365325, + "learning_rate": 0.0001893365371294346, + "loss": 1.1705, + "step": 157 + }, + { + "epoch": 0.3839611178614824, + "grad_norm": 40.300954908722304, + "learning_rate": 0.0001891553032909996, + "loss": 1.1831, + "step": 158 + }, + { + "epoch": 0.3863912515188335, + "grad_norm": 53.72009888405355, + "learning_rate": 0.00018897263054519498, + "loss": 1.1613, + "step": 159 + }, + { + "epoch": 0.3888213851761847, + "grad_norm": 142.22686975859034, + "learning_rate": 0.0001887885218402375, + "loss": 1.1639, + "step": 160 + }, + { + "epoch": 0.39125151883353587, + "grad_norm": 50.141889086717356, + "learning_rate": 0.00018860298014751944, + "loss": 1.1659, + "step": 161 + }, + { + "epoch": 0.393681652490887, + "grad_norm": 63.25519968311113, + "learning_rate": 0.0001884160084615604, + "loss": 1.168, + "step": 162 + }, + { + "epoch": 0.39611178614823817, + "grad_norm": 50.59325246324073, + "learning_rate": 0.0001882276097999592, + "loss": 1.1202, + "step": 163 + }, + { + "epoch": 0.3985419198055893, + "grad_norm": 58.32587879810431, + "learning_rate": 0.0001880377872033451, + "loss": 1.1587, + "step": 164 + }, + { + "epoch": 0.40097205346294046, + "grad_norm": 211.50882688314653, + "learning_rate": 0.00018784654373532866, + "loss": 1.1551, + "step": 165 + }, 
+ { + "epoch": 0.40340218712029163, + "grad_norm": 47.82888424614203, + "learning_rate": 0.00018765388248245246, + "loss": 1.2274, + "step": 166 + }, + { + "epoch": 0.40583232077764275, + "grad_norm": 97.94922685274778, + "learning_rate": 0.00018745980655414114, + "loss": 1.0872, + "step": 167 + }, + { + "epoch": 0.4082624544349939, + "grad_norm": 44.74994721544976, + "learning_rate": 0.0001872643190826512, + "loss": 1.1244, + "step": 168 + }, + { + "epoch": 0.4106925880923451, + "grad_norm": 53.84692426866845, + "learning_rate": 0.00018706742322302064, + "loss": 1.1576, + "step": 169 + }, + { + "epoch": 0.4131227217496962, + "grad_norm": 54.43599132185614, + "learning_rate": 0.0001868691221530178, + "loss": 1.0957, + "step": 170 + }, + { + "epoch": 0.4155528554070474, + "grad_norm": 39.21766518089018, + "learning_rate": 0.00018666941907309026, + "loss": 1.1625, + "step": 171 + }, + { + "epoch": 0.41798298906439857, + "grad_norm": 49.40030697752548, + "learning_rate": 0.000186468317206313, + "loss": 1.1556, + "step": 172 + }, + { + "epoch": 0.4204131227217497, + "grad_norm": 101.50309647820374, + "learning_rate": 0.0001862658197983366, + "loss": 1.1687, + "step": 173 + }, + { + "epoch": 0.42284325637910086, + "grad_norm": 105.41233861946563, + "learning_rate": 0.0001860619301173347, + "loss": 1.1687, + "step": 174 + }, + { + "epoch": 0.425273390036452, + "grad_norm": 103.99749987770305, + "learning_rate": 0.0001858566514539513, + "loss": 1.144, + "step": 175 + }, + { + "epoch": 0.42770352369380316, + "grad_norm": 78.83490301242213, + "learning_rate": 0.0001856499871212477, + "loss": 1.2318, + "step": 176 + }, + { + "epoch": 0.43013365735115433, + "grad_norm": 62.325757489859335, + "learning_rate": 0.00018544194045464886, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.43256379100850545, + "grad_norm": 81.32804926878099, + "learning_rate": 0.00018523251481188986, + "loss": 1.2233, + "step": 178 + }, + { + "epoch": 0.4349939246658566, + "grad_norm": 
38.97928032166606, + "learning_rate": 0.00018502171357296144, + "loss": 1.2371, + "step": 179 + }, + { + "epoch": 0.4374240583232078, + "grad_norm": 82.62345361244209, + "learning_rate": 0.0001848095401400555, + "loss": 1.1562, + "step": 180 + }, + { + "epoch": 0.4398541919805589, + "grad_norm": 47.793381366401626, + "learning_rate": 0.0001845959979375104, + "loss": 1.1249, + "step": 181 + }, + { + "epoch": 0.4422843256379101, + "grad_norm": 53.6022948471739, + "learning_rate": 0.00018438109041175532, + "loss": 1.1415, + "step": 182 + }, + { + "epoch": 0.44471445929526127, + "grad_norm": 65.92717051568573, + "learning_rate": 0.00018416482103125506, + "loss": 1.1748, + "step": 183 + }, + { + "epoch": 0.4471445929526124, + "grad_norm": 59.410481167619494, + "learning_rate": 0.0001839471932864537, + "loss": 1.1399, + "step": 184 + }, + { + "epoch": 0.44957472660996356, + "grad_norm": 64.22740395872977, + "learning_rate": 0.0001837282106897185, + "loss": 1.2193, + "step": 185 + }, + { + "epoch": 0.4520048602673147, + "grad_norm": 54.63497168787729, + "learning_rate": 0.00018350787677528306, + "loss": 1.153, + "step": 186 + }, + { + "epoch": 0.45443499392466585, + "grad_norm": 49.60676029637355, + "learning_rate": 0.00018328619509919044, + "loss": 1.1509, + "step": 187 + }, + { + "epoch": 0.456865127582017, + "grad_norm": 32.29074835877607, + "learning_rate": 0.00018306316923923563, + "loss": 1.1851, + "step": 188 + }, + { + "epoch": 0.45929526123936815, + "grad_norm": 61.13632454163589, + "learning_rate": 0.0001828388027949078, + "loss": 1.1323, + "step": 189 + }, + { + "epoch": 0.4617253948967193, + "grad_norm": 67.48617660835801, + "learning_rate": 0.00018261309938733238, + "loss": 1.1956, + "step": 190 + }, + { + "epoch": 0.4641555285540705, + "grad_norm": 38.31182257784929, + "learning_rate": 0.00018238606265921238, + "loss": 1.1379, + "step": 191 + }, + { + "epoch": 0.4665856622114216, + "grad_norm": 47.30995766708629, + "learning_rate": 0.00018215769627476984, + 
"loss": 1.1462, + "step": 192 + }, + { + "epoch": 0.4690157958687728, + "grad_norm": 34.57093925891121, + "learning_rate": 0.00018192800391968642, + "loss": 1.1979, + "step": 193 + }, + { + "epoch": 0.47144592952612396, + "grad_norm": 34.45645740457662, + "learning_rate": 0.0001816969893010442, + "loss": 1.1763, + "step": 194 + }, + { + "epoch": 0.4738760631834751, + "grad_norm": 39.21862152859671, + "learning_rate": 0.00018146465614726567, + "loss": 1.1514, + "step": 195 + }, + { + "epoch": 0.47630619684082626, + "grad_norm": 34.765347344568106, + "learning_rate": 0.00018123100820805355, + "loss": 1.1426, + "step": 196 + }, + { + "epoch": 0.4787363304981774, + "grad_norm": 35.04245362239315, + "learning_rate": 0.00018099604925433043, + "loss": 1.143, + "step": 197 + }, + { + "epoch": 0.48116646415552855, + "grad_norm": 103.45636476066032, + "learning_rate": 0.00018075978307817764, + "loss": 1.1713, + "step": 198 + }, + { + "epoch": 0.4835965978128797, + "grad_norm": 43.0297373660821, + "learning_rate": 0.00018052221349277442, + "loss": 1.2226, + "step": 199 + }, + { + "epoch": 0.48602673147023084, + "grad_norm": 32.80474372048966, + "learning_rate": 0.000180283344332336, + "loss": 1.1556, + "step": 200 + }, + { + "epoch": 0.488456865127582, + "grad_norm": 59.42688731224296, + "learning_rate": 0.00018004317945205197, + "loss": 1.1411, + "step": 201 + }, + { + "epoch": 0.4908869987849332, + "grad_norm": 102.0917822407188, + "learning_rate": 0.000179801722728024, + "loss": 1.1309, + "step": 202 + }, + { + "epoch": 0.4933171324422843, + "grad_norm": 309.9346821950787, + "learning_rate": 0.0001795589780572031, + "loss": 1.1953, + "step": 203 + }, + { + "epoch": 0.4957472660996355, + "grad_norm": 344.5019267346993, + "learning_rate": 0.0001793149493573271, + "loss": 1.1524, + "step": 204 + }, + { + "epoch": 0.49817739975698666, + "grad_norm": 50.075205946207085, + "learning_rate": 0.00017906964056685706, + "loss": 1.1495, + "step": 205 + }, + { + "epoch": 
0.5006075334143378, + "grad_norm": 132.32227258331488, + "learning_rate": 0.00017882305564491396, + "loss": 1.1976, + "step": 206 + }, + { + "epoch": 0.5006075334143378, + "eval_loss": 1.146019458770752, + "eval_runtime": 52.7816, + "eval_samples_per_second": 14.096, + "eval_steps_per_second": 1.762, + "step": 206 + }, + { + "epoch": 0.503037667071689, + "grad_norm": 138.57200377669218, + "learning_rate": 0.00017857519857121458, + "loss": 1.2159, + "step": 207 + }, + { + "epoch": 0.5054678007290401, + "grad_norm": 268.41109734161546, + "learning_rate": 0.00017832607334600746, + "loss": 1.1748, + "step": 208 + }, + { + "epoch": 0.5078979343863913, + "grad_norm": 72.44153953442401, + "learning_rate": 0.00017807568399000822, + "loss": 1.1758, + "step": 209 + }, + { + "epoch": 0.5103280680437424, + "grad_norm": 97.75400124096738, + "learning_rate": 0.00017782403454433477, + "loss": 1.1004, + "step": 210 + }, + { + "epoch": 0.5127582017010935, + "grad_norm": 84.19522802756285, + "learning_rate": 0.000177571129070442, + "loss": 1.1397, + "step": 211 + }, + { + "epoch": 0.5151883353584447, + "grad_norm": 132.95081835535706, + "learning_rate": 0.00017731697165005618, + "loss": 1.146, + "step": 212 + }, + { + "epoch": 0.5176184690157959, + "grad_norm": 560.3351292126325, + "learning_rate": 0.0001770615663851093, + "loss": 1.1937, + "step": 213 + }, + { + "epoch": 0.520048602673147, + "grad_norm": 252.72862614645885, + "learning_rate": 0.0001768049173976727, + "loss": 1.1213, + "step": 214 + }, + { + "epoch": 0.5224787363304981, + "grad_norm": 356.2985211032981, + "learning_rate": 0.0001765470288298905, + "loss": 1.22, + "step": 215 + }, + { + "epoch": 0.5249088699878494, + "grad_norm": 952.600672502031, + "learning_rate": 0.00017628790484391284, + "loss": 1.1321, + "step": 216 + }, + { + "epoch": 0.5273390036452005, + "grad_norm": 289.9357041930161, + "learning_rate": 0.0001760275496218288, + "loss": 1.1688, + "step": 217 + }, + { + "epoch": 0.5297691373025516, + 
"grad_norm": 48.69445264741508, + "learning_rate": 0.0001757659673655986, + "loss": 1.1551, + "step": 218 + }, + { + "epoch": 0.5321992709599028, + "grad_norm": 40.15160247154335, + "learning_rate": 0.0001755031622969862, + "loss": 1.1459, + "step": 219 + }, + { + "epoch": 0.534629404617254, + "grad_norm": 44.59390817019205, + "learning_rate": 0.00017523913865749078, + "loss": 1.2012, + "step": 220 + }, + { + "epoch": 0.5370595382746051, + "grad_norm": 30.189717624412484, + "learning_rate": 0.00017497390070827848, + "loss": 1.15, + "step": 221 + }, + { + "epoch": 0.5394896719319563, + "grad_norm": 27.185608574176108, + "learning_rate": 0.00017470745273011362, + "loss": 1.0763, + "step": 222 + }, + { + "epoch": 0.5419198055893074, + "grad_norm": 99.44121390806423, + "learning_rate": 0.00017443979902328956, + "loss": 1.1478, + "step": 223 + }, + { + "epoch": 0.5443499392466585, + "grad_norm": 29.684499344634585, + "learning_rate": 0.00017417094390755934, + "loss": 1.1123, + "step": 224 + }, + { + "epoch": 0.5467800729040098, + "grad_norm": 26.788847114635054, + "learning_rate": 0.00017390089172206592, + "loss": 1.1169, + "step": 225 + }, + { + "epoch": 0.5492102065613609, + "grad_norm": 31.84817878214798, + "learning_rate": 0.00017362964682527218, + "loss": 1.1524, + "step": 226 + }, + { + "epoch": 0.551640340218712, + "grad_norm": 34.834632993822424, + "learning_rate": 0.00017335721359489057, + "loss": 1.1761, + "step": 227 + }, + { + "epoch": 0.5540704738760632, + "grad_norm": 66.6084234453716, + "learning_rate": 0.00017308359642781242, + "loss": 1.1175, + "step": 228 + }, + { + "epoch": 0.5565006075334143, + "grad_norm": 35.15720180142773, + "learning_rate": 0.00017280879974003707, + "loss": 1.2012, + "step": 229 + }, + { + "epoch": 0.5589307411907655, + "grad_norm": 35.975450782756226, + "learning_rate": 0.00017253282796660056, + "loss": 1.1801, + "step": 230 + }, + { + "epoch": 0.5613608748481167, + "grad_norm": 83.49050230764925, + "learning_rate": 
0.0001722556855615039, + "loss": 1.1576, + "step": 231 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 150.44630441002784, + "learning_rate": 0.00017197737699764146, + "loss": 1.1826, + "step": 232 + }, + { + "epoch": 0.5662211421628189, + "grad_norm": 31.322382197739042, + "learning_rate": 0.00017169790676672858, + "loss": 1.1784, + "step": 233 + }, + { + "epoch": 0.56865127582017, + "grad_norm": 33.15983653687515, + "learning_rate": 0.0001714172793792291, + "loss": 1.1411, + "step": 234 + }, + { + "epoch": 0.5710814094775213, + "grad_norm": 22.206850165103052, + "learning_rate": 0.0001711354993642827, + "loss": 1.1772, + "step": 235 + }, + { + "epoch": 0.5735115431348724, + "grad_norm": 43.35721272668955, + "learning_rate": 0.00017085257126963152, + "loss": 1.0915, + "step": 236 + }, + { + "epoch": 0.5759416767922235, + "grad_norm": 29.57234737116712, + "learning_rate": 0.0001705684996615472, + "loss": 1.0977, + "step": 237 + }, + { + "epoch": 0.5783718104495748, + "grad_norm": 42.929644875053214, + "learning_rate": 0.00017028328912475668, + "loss": 1.1782, + "step": 238 + }, + { + "epoch": 0.5808019441069259, + "grad_norm": 32.15711272871687, + "learning_rate": 0.0001699969442623686, + "loss": 1.1855, + "step": 239 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 43.64453730184205, + "learning_rate": 0.00016970946969579887, + "loss": 1.1171, + "step": 240 + }, + { + "epoch": 0.5856622114216282, + "grad_norm": 26.145541544112593, + "learning_rate": 0.00016942087006469592, + "loss": 1.1656, + "step": 241 + }, + { + "epoch": 0.5880923450789793, + "grad_norm": 53.98173886095731, + "learning_rate": 0.00016913115002686616, + "loss": 1.1378, + "step": 242 + }, + { + "epoch": 0.5905224787363305, + "grad_norm": 50.851193586801195, + "learning_rate": 0.00016884031425819853, + "loss": 1.1338, + "step": 243 + }, + { + "epoch": 0.5929526123936817, + "grad_norm": 30.166674036386443, + "learning_rate": 0.0001685483674525891, + "loss": 1.1732, + "step": 244 + }, + { 
+ "epoch": 0.5953827460510328, + "grad_norm": 32.580505176392656, + "learning_rate": 0.00016825531432186543, + "loss": 1.143, + "step": 245 + }, + { + "epoch": 0.5978128797083839, + "grad_norm": 35.087231952662634, + "learning_rate": 0.0001679611595957103, + "loss": 1.212, + "step": 246 + }, + { + "epoch": 0.6002430133657352, + "grad_norm": 44.69578306542608, + "learning_rate": 0.00016766590802158566, + "loss": 1.1527, + "step": 247 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 39.8378839133733, + "learning_rate": 0.00016736956436465573, + "loss": 1.2174, + "step": 248 + }, + { + "epoch": 0.6051032806804374, + "grad_norm": 25.571860004032857, + "learning_rate": 0.0001670721334077103, + "loss": 1.1031, + "step": 249 + }, + { + "epoch": 0.6075334143377886, + "grad_norm": 27.626061413643438, + "learning_rate": 0.00016677361995108743, + "loss": 1.107, + "step": 250 + }, + { + "epoch": 0.6099635479951397, + "grad_norm": 47.405627339857176, + "learning_rate": 0.00016647402881259598, + "loss": 1.1521, + "step": 251 + }, + { + "epoch": 0.6123936816524909, + "grad_norm": 31.951762409660272, + "learning_rate": 0.00016617336482743794, + "loss": 1.174, + "step": 252 + }, + { + "epoch": 0.6148238153098421, + "grad_norm": 44.304437144236104, + "learning_rate": 0.00016587163284813032, + "loss": 1.1286, + "step": 253 + }, + { + "epoch": 0.6172539489671932, + "grad_norm": 21.990501251879344, + "learning_rate": 0.00016556883774442675, + "loss": 1.1927, + "step": 254 + }, + { + "epoch": 0.6196840826245443, + "grad_norm": 43.91119350789936, + "learning_rate": 0.00016526498440323914, + "loss": 1.1399, + "step": 255 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 28.064569132249982, + "learning_rate": 0.00016496007772855853, + "loss": 1.1913, + "step": 256 + }, + { + "epoch": 0.6245443499392467, + "grad_norm": 99.97142272243896, + "learning_rate": 0.0001646541226413761, + "loss": 1.1694, + "step": 257 + }, + { + "epoch": 0.6269744835965978, + "grad_norm": 
27.12524206817854, + "learning_rate": 0.00016434712407960373, + "loss": 1.2398, + "step": 258 + }, + { + "epoch": 0.6294046172539489, + "grad_norm": 42.99171796479219, + "learning_rate": 0.00016403908699799425, + "loss": 1.145, + "step": 259 + }, + { + "epoch": 0.6318347509113001, + "grad_norm": 24.064938768293658, + "learning_rate": 0.00016373001636806153, + "loss": 1.098, + "step": 260 + }, + { + "epoch": 0.6342648845686513, + "grad_norm": 31.72232981247621, + "learning_rate": 0.00016341991717800023, + "loss": 1.1779, + "step": 261 + }, + { + "epoch": 0.6366950182260024, + "grad_norm": 39.97326887390835, + "learning_rate": 0.00016310879443260528, + "loss": 1.3142, + "step": 262 + }, + { + "epoch": 0.6391251518833536, + "grad_norm": 27.519208072826963, + "learning_rate": 0.00016279665315319114, + "loss": 1.2039, + "step": 263 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 52.94895557810481, + "learning_rate": 0.00016248349837751062, + "loss": 1.1718, + "step": 264 + }, + { + "epoch": 0.6439854191980559, + "grad_norm": 23.603047222747566, + "learning_rate": 0.0001621693351596739, + "loss": 1.1155, + "step": 265 + }, + { + "epoch": 0.6464155528554071, + "grad_norm": 21.400341520569807, + "learning_rate": 0.00016185416857006647, + "loss": 1.1242, + "step": 266 + }, + { + "epoch": 0.6488456865127582, + "grad_norm": 51.167335508822276, + "learning_rate": 0.00016153800369526788, + "loss": 1.1746, + "step": 267 + }, + { + "epoch": 0.6512758201701093, + "grad_norm": 26.219581065473573, + "learning_rate": 0.00016122084563796905, + "loss": 1.0836, + "step": 268 + }, + { + "epoch": 0.6537059538274606, + "grad_norm": 56.820249886600706, + "learning_rate": 0.0001609026995168904, + "loss": 1.1625, + "step": 269 + }, + { + "epoch": 0.6561360874848117, + "grad_norm": 37.43384869992443, + "learning_rate": 0.00016058357046669898, + "loss": 1.2143, + "step": 270 + }, + { + "epoch": 0.6585662211421628, + "grad_norm": 31.885237168871473, + "learning_rate": 
0.00016026346363792567, + "loss": 1.1536, + "step": 271 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 34.66147983279251, + "learning_rate": 0.00015994238419688199, + "loss": 1.2095, + "step": 272 + }, + { + "epoch": 0.6634264884568651, + "grad_norm": 86.90365354594917, + "learning_rate": 0.00015962033732557686, + "loss": 1.1149, + "step": 273 + }, + { + "epoch": 0.6658566221142163, + "grad_norm": 52.21177462889067, + "learning_rate": 0.00015929732822163287, + "loss": 1.1861, + "step": 274 + }, + { + "epoch": 0.6682867557715675, + "grad_norm": 92.11184701145604, + "learning_rate": 0.00015897336209820239, + "loss": 1.1853, + "step": 275 + }, + { + "epoch": 0.6707168894289186, + "grad_norm": 30.662475573811115, + "learning_rate": 0.00015864844418388342, + "loss": 1.0912, + "step": 276 + }, + { + "epoch": 0.6731470230862697, + "grad_norm": 26.15855468837027, + "learning_rate": 0.00015832257972263523, + "loss": 1.1618, + "step": 277 + }, + { + "epoch": 0.675577156743621, + "grad_norm": 41.14250673970726, + "learning_rate": 0.00015799577397369375, + "loss": 1.1499, + "step": 278 + }, + { + "epoch": 0.6780072904009721, + "grad_norm": 31.93253644773631, + "learning_rate": 0.00015766803221148673, + "loss": 1.1229, + "step": 279 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 39.87120131585165, + "learning_rate": 0.00015733935972554844, + "loss": 1.1647, + "step": 280 + }, + { + "epoch": 0.6828675577156743, + "grad_norm": 52.741654062271124, + "learning_rate": 0.0001570097618204345, + "loss": 1.1362, + "step": 281 + }, + { + "epoch": 0.6852976913730255, + "grad_norm": 33.13137686002526, + "learning_rate": 0.0001566792438156362, + "loss": 1.1825, + "step": 282 + }, + { + "epoch": 0.6877278250303767, + "grad_norm": 20.284041564566042, + "learning_rate": 0.00015634781104549442, + "loss": 1.1439, + "step": 283 + }, + { + "epoch": 0.6901579586877278, + "grad_norm": 164.9222932471453, + "learning_rate": 0.00015601546885911404, + "loss": 1.122, + "step": 284 + }, + { 
+ "epoch": 0.692588092345079, + "grad_norm": 27.092346730158148, + "learning_rate": 0.00015568222262027717, + "loss": 1.157, + "step": 285 + }, + { + "epoch": 0.6950182260024301, + "grad_norm": 39.46898996008012, + "learning_rate": 0.00015534807770735664, + "loss": 1.1092, + "step": 286 + }, + { + "epoch": 0.6974483596597812, + "grad_norm": 30.00942949300714, + "learning_rate": 0.00015501303951322943, + "loss": 1.243, + "step": 287 + }, + { + "epoch": 0.6998784933171325, + "grad_norm": 31.435817418038887, + "learning_rate": 0.00015467711344518942, + "loss": 1.1034, + "step": 288 + }, + { + "epoch": 0.7023086269744836, + "grad_norm": 54.53572773177548, + "learning_rate": 0.00015434030492486023, + "loss": 1.2216, + "step": 289 + }, + { + "epoch": 0.7047387606318347, + "grad_norm": 24.51082708234768, + "learning_rate": 0.00015400261938810757, + "loss": 1.1532, + "step": 290 + }, + { + "epoch": 0.707168894289186, + "grad_norm": 104.85480514443172, + "learning_rate": 0.00015366406228495172, + "loss": 1.1156, + "step": 291 + }, + { + "epoch": 0.7095990279465371, + "grad_norm": 26.398830117870997, + "learning_rate": 0.0001533246390794794, + "loss": 1.0934, + "step": 292 + }, + { + "epoch": 0.7120291616038882, + "grad_norm": 25.062392373037707, + "learning_rate": 0.00015298435524975572, + "loss": 1.1453, + "step": 293 + }, + { + "epoch": 0.7144592952612394, + "grad_norm": 25.385505352027444, + "learning_rate": 0.0001526432162877356, + "loss": 1.1359, + "step": 294 + }, + { + "epoch": 0.7168894289185905, + "grad_norm": 18.00146943000571, + "learning_rate": 0.00015230122769917527, + "loss": 1.1129, + "step": 295 + }, + { + "epoch": 0.7193195625759417, + "grad_norm": 22.55383473288135, + "learning_rate": 0.00015195839500354335, + "loss": 1.142, + "step": 296 + }, + { + "epoch": 0.7217496962332929, + "grad_norm": 30.013723395820165, + "learning_rate": 0.00015161472373393186, + "loss": 1.1379, + "step": 297 + }, + { + "epoch": 0.724179829890644, + "grad_norm": 
40.566201545240425, + "learning_rate": 0.0001512702194369668, + "loss": 1.1326, + "step": 298 + }, + { + "epoch": 0.7266099635479951, + "grad_norm": 27.34716639907029, + "learning_rate": 0.00015092488767271857, + "loss": 1.0782, + "step": 299 + }, + { + "epoch": 0.7290400972053463, + "grad_norm": 45.0837594669075, + "learning_rate": 0.00015057873401461253, + "loss": 1.2054, + "step": 300 + }, + { + "epoch": 0.7314702308626975, + "grad_norm": 22.39794101270309, + "learning_rate": 0.00015023176404933874, + "loss": 1.1052, + "step": 301 + }, + { + "epoch": 0.7339003645200486, + "grad_norm": 21.818512025585306, + "learning_rate": 0.00014988398337676198, + "loss": 1.1664, + "step": 302 + }, + { + "epoch": 0.7363304981773997, + "grad_norm": 33.09386163968815, + "learning_rate": 0.00014953539760983122, + "loss": 1.1364, + "step": 303 + }, + { + "epoch": 0.7387606318347509, + "grad_norm": 26.3253592215911, + "learning_rate": 0.00014918601237448923, + "loss": 1.1093, + "step": 304 + }, + { + "epoch": 0.741190765492102, + "grad_norm": 32.54878723405212, + "learning_rate": 0.0001488358333095816, + "loss": 1.182, + "step": 305 + }, + { + "epoch": 0.7436208991494532, + "grad_norm": 28.645473311846015, + "learning_rate": 0.0001484848660667658, + "loss": 1.2064, + "step": 306 + }, + { + "epoch": 0.7460510328068044, + "grad_norm": 29.02693042820854, + "learning_rate": 0.00014813311631041995, + "loss": 1.1545, + "step": 307 + }, + { + "epoch": 0.7484811664641555, + "grad_norm": 20.28193033099828, + "learning_rate": 0.00014778058971755154, + "loss": 1.1885, + "step": 308 + }, + { + "epoch": 0.7509113001215066, + "grad_norm": 121.86121371804961, + "learning_rate": 0.00014742729197770552, + "loss": 1.095, + "step": 309 + }, + { + "epoch": 0.7509113001215066, + "eval_loss": 1.133868932723999, + "eval_runtime": 52.6711, + "eval_samples_per_second": 14.125, + "eval_steps_per_second": 1.766, + "step": 309 + }, + { + "epoch": 0.7533414337788579, + "grad_norm": 50.1793074315811, + 
"learning_rate": 0.00014707322879287276, + "loss": 1.1679, + "step": 310 + }, + { + "epoch": 0.755771567436209, + "grad_norm": 31.791309498678103, + "learning_rate": 0.00014671840587739783, + "loss": 1.1277, + "step": 311 + }, + { + "epoch": 0.7582017010935601, + "grad_norm": 56.88911226488106, + "learning_rate": 0.00014636282895788688, + "loss": 1.1492, + "step": 312 + }, + { + "epoch": 0.7606318347509113, + "grad_norm": 117.29437608667352, + "learning_rate": 0.00014600650377311522, + "loss": 1.1123, + "step": 313 + }, + { + "epoch": 0.7630619684082625, + "grad_norm": 107.56728772749254, + "learning_rate": 0.00014564943607393459, + "loss": 1.171, + "step": 314 + }, + { + "epoch": 0.7654921020656136, + "grad_norm": 34.085830256919685, + "learning_rate": 0.0001452916316231805, + "loss": 1.1854, + "step": 315 + }, + { + "epoch": 0.7679222357229648, + "grad_norm": 23.625747202851176, + "learning_rate": 0.000144933096195579, + "loss": 1.1622, + "step": 316 + }, + { + "epoch": 0.7703523693803159, + "grad_norm": 56.9917185309248, + "learning_rate": 0.00014457383557765386, + "loss": 1.2037, + "step": 317 + }, + { + "epoch": 0.772782503037667, + "grad_norm": 34.55554043725056, + "learning_rate": 0.00014421385556763266, + "loss": 1.1273, + "step": 318 + }, + { + "epoch": 0.7752126366950183, + "grad_norm": 34.205286759913115, + "learning_rate": 0.00014385316197535372, + "loss": 1.2039, + "step": 319 + }, + { + "epoch": 0.7776427703523694, + "grad_norm": 27.30015395778206, + "learning_rate": 0.00014349176062217195, + "loss": 1.1903, + "step": 320 + }, + { + "epoch": 0.7800729040097205, + "grad_norm": 23.077745147127867, + "learning_rate": 0.00014312965734086518, + "loss": 1.1539, + "step": 321 + }, + { + "epoch": 0.7825030376670717, + "grad_norm": 26.22112568156326, + "learning_rate": 0.00014276685797553977, + "loss": 1.1807, + "step": 322 + }, + { + "epoch": 0.7849331713244229, + "grad_norm": 34.813719314948514, + "learning_rate": 0.0001424033683815365, + "loss": 1.1247, + 
"step": 323 + }, + { + "epoch": 0.787363304981774, + "grad_norm": 27.109609629038324, + "learning_rate": 0.00014203919442533597, + "loss": 1.1735, + "step": 324 + }, + { + "epoch": 0.7897934386391251, + "grad_norm": 144.91672798575476, + "learning_rate": 0.00014167434198446383, + "loss": 1.1007, + "step": 325 + }, + { + "epoch": 0.7922235722964763, + "grad_norm": 42.19042828736382, + "learning_rate": 0.00014130881694739616, + "loss": 1.1398, + "step": 326 + }, + { + "epoch": 0.7946537059538274, + "grad_norm": 43.00144921766715, + "learning_rate": 0.00014094262521346427, + "loss": 1.1712, + "step": 327 + }, + { + "epoch": 0.7970838396111786, + "grad_norm": 26.343159670729925, + "learning_rate": 0.0001405757726927595, + "loss": 1.2103, + "step": 328 + }, + { + "epoch": 0.7995139732685298, + "grad_norm": 31.68271222195729, + "learning_rate": 0.00014020826530603776, + "loss": 1.1578, + "step": 329 + }, + { + "epoch": 0.8019441069258809, + "grad_norm": 39.08920292536896, + "learning_rate": 0.00013984010898462416, + "loss": 1.1377, + "step": 330 + }, + { + "epoch": 0.804374240583232, + "grad_norm": 34.56898084569197, + "learning_rate": 0.00013947130967031717, + "loss": 1.1886, + "step": 331 + }, + { + "epoch": 0.8068043742405833, + "grad_norm": 42.016356369933895, + "learning_rate": 0.00013910187331529276, + "loss": 1.1577, + "step": 332 + }, + { + "epoch": 0.8092345078979344, + "grad_norm": 21.25953597879822, + "learning_rate": 0.00013873180588200827, + "loss": 1.1259, + "step": 333 + }, + { + "epoch": 0.8116646415552855, + "grad_norm": 39.49634140985428, + "learning_rate": 0.0001383611133431062, + "loss": 1.173, + "step": 334 + }, + { + "epoch": 0.8140947752126367, + "grad_norm": 29.837690582268863, + "learning_rate": 0.00013798980168131794, + "loss": 1.1322, + "step": 335 + }, + { + "epoch": 0.8165249088699879, + "grad_norm": 23.510451396240928, + "learning_rate": 0.000137617876889367, + "loss": 1.1392, + "step": 336 + }, + { + "epoch": 0.818955042527339, + 
"grad_norm": 19.183017199526635, + "learning_rate": 0.00013724534496987247, + "loss": 1.157, + "step": 337 + }, + { + "epoch": 0.8213851761846902, + "grad_norm": 51.85037647612581, + "learning_rate": 0.0001368722119352521, + "loss": 1.1255, + "step": 338 + }, + { + "epoch": 0.8238153098420413, + "grad_norm": 31.635699477838273, + "learning_rate": 0.00013649848380762513, + "loss": 1.1429, + "step": 339 + }, + { + "epoch": 0.8262454434993924, + "grad_norm": 39.6479124739029, + "learning_rate": 0.00013612416661871533, + "loss": 1.1609, + "step": 340 + }, + { + "epoch": 0.8286755771567437, + "grad_norm": 21.453228401011238, + "learning_rate": 0.0001357492664097534, + "loss": 1.1247, + "step": 341 + }, + { + "epoch": 0.8311057108140948, + "grad_norm": 28.514958428145494, + "learning_rate": 0.00013537378923137973, + "loss": 1.0845, + "step": 342 + }, + { + "epoch": 0.8335358444714459, + "grad_norm": 26.98663985253516, + "learning_rate": 0.00013499774114354655, + "loss": 1.1092, + "step": 343 + }, + { + "epoch": 0.8359659781287971, + "grad_norm": 30.76143424141064, + "learning_rate": 0.00013462112821542016, + "loss": 1.1759, + "step": 344 + }, + { + "epoch": 0.8383961117861483, + "grad_norm": 39.023771167108656, + "learning_rate": 0.0001342439565252831, + "loss": 1.1024, + "step": 345 + }, + { + "epoch": 0.8408262454434994, + "grad_norm": 29.787639099820225, + "learning_rate": 0.0001338662321604358, + "loss": 1.2141, + "step": 346 + }, + { + "epoch": 0.8432563791008505, + "grad_norm": 25.60634301240642, + "learning_rate": 0.00013348796121709862, + "loss": 1.1244, + "step": 347 + }, + { + "epoch": 0.8456865127582017, + "grad_norm": 76.98542857181108, + "learning_rate": 0.00013310914980031334, + "loss": 1.19, + "step": 348 + }, + { + "epoch": 0.8481166464155528, + "grad_norm": 110.28982985071892, + "learning_rate": 0.0001327298040238446, + "loss": 1.1295, + "step": 349 + }, + { + "epoch": 0.850546780072904, + "grad_norm": 22.610631125609732, + "learning_rate": 
0.0001323499300100811, + "loss": 1.1445, + "step": 350 + }, + { + "epoch": 0.8529769137302552, + "grad_norm": 29.958515973723888, + "learning_rate": 0.00013196953388993726, + "loss": 1.2048, + "step": 351 + }, + { + "epoch": 0.8554070473876063, + "grad_norm": 30.691798031468103, + "learning_rate": 0.00013158862180275363, + "loss": 1.1628, + "step": 352 + }, + { + "epoch": 0.8578371810449574, + "grad_norm": 28.568576369680258, + "learning_rate": 0.00013120719989619833, + "loss": 1.0899, + "step": 353 + }, + { + "epoch": 0.8602673147023087, + "grad_norm": 42.12623456189728, + "learning_rate": 0.0001308252743261675, + "loss": 1.1451, + "step": 354 + }, + { + "epoch": 0.8626974483596598, + "grad_norm": 112.39248005736448, + "learning_rate": 0.00013044285125668614, + "loss": 1.154, + "step": 355 + }, + { + "epoch": 0.8651275820170109, + "grad_norm": 28.013602355549782, + "learning_rate": 0.0001300599368598086, + "loss": 1.1937, + "step": 356 + }, + { + "epoch": 0.8675577156743621, + "grad_norm": 27.763517972300694, + "learning_rate": 0.0001296765373155188, + "loss": 1.1243, + "step": 357 + }, + { + "epoch": 0.8699878493317132, + "grad_norm": 112.85815824767063, + "learning_rate": 0.0001292926588116308, + "loss": 1.1595, + "step": 358 + }, + { + "epoch": 0.8724179829890644, + "grad_norm": 27.085127886556087, + "learning_rate": 0.00012890830754368855, + "loss": 1.1196, + "step": 359 + }, + { + "epoch": 0.8748481166464156, + "grad_norm": 31.56336829128541, + "learning_rate": 0.00012852348971486617, + "loss": 1.1231, + "step": 360 + }, + { + "epoch": 0.8772782503037667, + "grad_norm": 31.904393738907178, + "learning_rate": 0.0001281382115358679, + "loss": 1.097, + "step": 361 + }, + { + "epoch": 0.8797083839611178, + "grad_norm": 25.034453894065827, + "learning_rate": 0.00012775247922482748, + "loss": 1.1246, + "step": 362 + }, + { + "epoch": 0.8821385176184691, + "grad_norm": 33.221958266501474, + "learning_rate": 0.0001273662990072083, + "loss": 1.1189, + "step": 363 + }, 
+ { + "epoch": 0.8845686512758202, + "grad_norm": 26.638980136773224, + "learning_rate": 0.00012697967711570242, + "loss": 1.1315, + "step": 364 + }, + { + "epoch": 0.8869987849331713, + "grad_norm": 27.231479341362885, + "learning_rate": 0.00012659261979013043, + "loss": 1.1464, + "step": 365 + }, + { + "epoch": 0.8894289185905225, + "grad_norm": 19.654091006710207, + "learning_rate": 0.0001262051332773404, + "loss": 1.1271, + "step": 366 + }, + { + "epoch": 0.8918590522478737, + "grad_norm": 50.3934263865559, + "learning_rate": 0.00012581722383110718, + "loss": 1.1002, + "step": 367 + }, + { + "epoch": 0.8942891859052248, + "grad_norm": 20.25952031318632, + "learning_rate": 0.00012542889771203166, + "loss": 1.0629, + "step": 368 + }, + { + "epoch": 0.8967193195625759, + "grad_norm": 19.16914945262315, + "learning_rate": 0.00012504016118743935, + "loss": 1.1597, + "step": 369 + }, + { + "epoch": 0.8991494532199271, + "grad_norm": 35.65941460173898, + "learning_rate": 0.00012465102053127957, + "loss": 1.1501, + "step": 370 + }, + { + "epoch": 0.9015795868772782, + "grad_norm": 26.093269180565315, + "learning_rate": 0.00012426148202402404, + "loss": 1.1455, + "step": 371 + }, + { + "epoch": 0.9040097205346294, + "grad_norm": 30.928987547424892, + "learning_rate": 0.00012387155195256537, + "loss": 1.1392, + "step": 372 + }, + { + "epoch": 0.9064398541919806, + "grad_norm": 20.17512596846915, + "learning_rate": 0.00012348123661011601, + "loss": 1.1196, + "step": 373 + }, + { + "epoch": 0.9088699878493317, + "grad_norm": 24.380789157356805, + "learning_rate": 0.00012309054229610623, + "loss": 1.1, + "step": 374 + }, + { + "epoch": 0.9113001215066828, + "grad_norm": 95.49408387682203, + "learning_rate": 0.00012269947531608276, + "loss": 1.1825, + "step": 375 + }, + { + "epoch": 0.913730255164034, + "grad_norm": 23.635286340368726, + "learning_rate": 0.0001223080419816069, + "loss": 1.1717, + "step": 376 + }, + { + "epoch": 0.9161603888213852, + "grad_norm": 
21.942478063568313, + "learning_rate": 0.00012191624861015254, + "loss": 1.1661, + "step": 377 + }, + { + "epoch": 0.9185905224787363, + "grad_norm": 74.12601397150299, + "learning_rate": 0.00012152410152500453, + "loss": 1.1967, + "step": 378 + }, + { + "epoch": 0.9210206561360875, + "grad_norm": 37.26720386499629, + "learning_rate": 0.00012113160705515625, + "loss": 1.1566, + "step": 379 + }, + { + "epoch": 0.9234507897934386, + "grad_norm": 34.080854733427635, + "learning_rate": 0.00012073877153520776, + "loss": 1.0847, + "step": 380 + }, + { + "epoch": 0.9258809234507898, + "grad_norm": 26.50842916877183, + "learning_rate": 0.0001203456013052634, + "loss": 1.0824, + "step": 381 + }, + { + "epoch": 0.928311057108141, + "grad_norm": 37.92039651416441, + "learning_rate": 0.00011995210271082944, + "loss": 1.1485, + "step": 382 + }, + { + "epoch": 0.9307411907654921, + "grad_norm": 38.56931832374284, + "learning_rate": 0.00011955828210271187, + "loss": 1.0737, + "step": 383 + }, + { + "epoch": 0.9331713244228432, + "grad_norm": 24.419015296791592, + "learning_rate": 0.0001191641458369136, + "loss": 1.1208, + "step": 384 + }, + { + "epoch": 0.9356014580801945, + "grad_norm": 28.75379656643836, + "learning_rate": 0.00011876970027453222, + "loss": 1.1071, + "step": 385 + }, + { + "epoch": 0.9380315917375456, + "grad_norm": 138.39305133994282, + "learning_rate": 0.00011837495178165706, + "loss": 1.1405, + "step": 386 + }, + { + "epoch": 0.9404617253948967, + "grad_norm": 22.200435229928654, + "learning_rate": 0.00011797990672926652, + "loss": 1.124, + "step": 387 + }, + { + "epoch": 0.9428918590522479, + "grad_norm": 40.21978055156661, + "learning_rate": 0.00011758457149312538, + "loss": 1.1875, + "step": 388 + }, + { + "epoch": 0.945321992709599, + "grad_norm": 23.592672098002485, + "learning_rate": 0.00011718895245368167, + "loss": 1.1748, + "step": 389 + }, + { + "epoch": 0.9477521263669502, + "grad_norm": 17.463183827323444, + "learning_rate": 
0.00011679305599596393, + "loss": 1.1794, + "step": 390 + }, + { + "epoch": 0.9501822600243013, + "grad_norm": 36.219441964332646, + "learning_rate": 0.00011639688850947799, + "loss": 1.1459, + "step": 391 + }, + { + "epoch": 0.9526123936816525, + "grad_norm": 23.727472560980413, + "learning_rate": 0.00011600045638810386, + "loss": 1.076, + "step": 392 + }, + { + "epoch": 0.9550425273390036, + "grad_norm": 57.63284414960702, + "learning_rate": 0.00011560376602999272, + "loss": 1.1919, + "step": 393 + }, + { + "epoch": 0.9574726609963548, + "grad_norm": 40.23829998466358, + "learning_rate": 0.00011520682383746333, + "loss": 1.0701, + "step": 394 + }, + { + "epoch": 0.959902794653706, + "grad_norm": 58.2018640218209, + "learning_rate": 0.00011480963621689905, + "loss": 1.1745, + "step": 395 + }, + { + "epoch": 0.9623329283110571, + "grad_norm": 27.693448904288406, + "learning_rate": 0.00011441220957864421, + "loss": 1.1323, + "step": 396 + }, + { + "epoch": 0.9647630619684082, + "grad_norm": 34.94430005820724, + "learning_rate": 0.00011401455033690076, + "loss": 1.1497, + "step": 397 + }, + { + "epoch": 0.9671931956257594, + "grad_norm": 17.521922247865188, + "learning_rate": 0.00011361666490962468, + "loss": 1.1319, + "step": 398 + }, + { + "epoch": 0.9696233292831106, + "grad_norm": 25.886687159935246, + "learning_rate": 0.00011321855971842243, + "loss": 1.1418, + "step": 399 + }, + { + "epoch": 0.9720534629404617, + "grad_norm": 31.388154506614836, + "learning_rate": 0.00011282024118844738, + "loss": 1.1282, + "step": 400 + }, + { + "epoch": 0.9744835965978129, + "grad_norm": 27.458601253675347, + "learning_rate": 0.00011242171574829599, + "loss": 1.1647, + "step": 401 + }, + { + "epoch": 0.976913730255164, + "grad_norm": 25.922873022924257, + "learning_rate": 0.00011202298982990411, + "loss": 1.091, + "step": 402 + }, + { + "epoch": 0.9793438639125152, + "grad_norm": 20.129467589894766, + "learning_rate": 0.00011162406986844323, + "loss": 1.2, + "step": 403 + }, 
+ { + "epoch": 0.9817739975698664, + "grad_norm": 25.11892123906363, + "learning_rate": 0.00011122496230221645, + "loss": 1.0731, + "step": 404 + }, + { + "epoch": 0.9842041312272175, + "grad_norm": 26.416884392453543, + "learning_rate": 0.00011082567357255484, + "loss": 1.1836, + "step": 405 + }, + { + "epoch": 0.9866342648845686, + "grad_norm": 18.768078773975784, + "learning_rate": 0.00011042621012371322, + "loss": 1.1275, + "step": 406 + }, + { + "epoch": 0.9890643985419199, + "grad_norm": 22.275756523796257, + "learning_rate": 0.00011002657840276627, + "loss": 1.1228, + "step": 407 + }, + { + "epoch": 0.991494532199271, + "grad_norm": 29.605335344828575, + "learning_rate": 0.00010962678485950455, + "loss": 1.0255, + "step": 408 + }, + { + "epoch": 0.9939246658566221, + "grad_norm": 41.1718200727633, + "learning_rate": 0.00010922683594633021, + "loss": 1.1876, + "step": 409 + }, + { + "epoch": 0.9963547995139733, + "grad_norm": 20.46397475257922, + "learning_rate": 0.00010882673811815304, + "loss": 1.1168, + "step": 410 + }, + { + "epoch": 0.9987849331713244, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010842649783228624, + "loss": 1.1948, + "step": 411 + }, + { + "epoch": 1.0, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010802612154834211, + "loss": 1.1076, + "step": 412 + }, + { + "epoch": 1.0, + "eval_loss": 1.121336579322815, + "eval_runtime": 52.7043, + "eval_samples_per_second": 14.116, + "eval_steps_per_second": 1.765, + "step": 412 + }, + { + "epoch": 1.0024301336573511, + "grad_norm": 35.25758968935371, + "learning_rate": 0.00010762561572812788, + "loss": 1.1335, + "step": 413 + }, + { + "epoch": 1.0048602673147022, + "grad_norm": 20.78715726366623, + "learning_rate": 0.0001072249868355415, + "loss": 1.1003, + "step": 414 + }, + { + "epoch": 1.0072904009720534, + "grad_norm": 31.01116633763719, + "learning_rate": 0.0001068242413364671, + "loss": 1.1225, + "step": 415 + }, + { + "epoch": 1.0097205346294047, + "grad_norm": 
19.050638172672897, + "learning_rate": 0.00010642338569867086, + "loss": 1.0595, + "step": 416 + }, + { + "epoch": 1.0121506682867558, + "grad_norm": 41.54235389574412, + "learning_rate": 0.00010602242639169648, + "loss": 1.1719, + "step": 417 + }, + { + "epoch": 1.014580801944107, + "grad_norm": 41.34218206464363, + "learning_rate": 0.00010562136988676078, + "loss": 1.1292, + "step": 418 + }, + { + "epoch": 1.017010935601458, + "grad_norm": 32.436985934581934, + "learning_rate": 0.0001052202226566494, + "loss": 1.1244, + "step": 419 + }, + { + "epoch": 1.0194410692588092, + "grad_norm": 19.631825450596665, + "learning_rate": 0.0001048189911756121, + "loss": 1.1323, + "step": 420 + }, + { + "epoch": 1.0218712029161603, + "grad_norm": 23.275029440216805, + "learning_rate": 0.00010441768191925847, + "loss": 1.1605, + "step": 421 + }, + { + "epoch": 1.0243013365735116, + "grad_norm": 21.44161988455765, + "learning_rate": 0.0001040163013644533, + "loss": 1.0886, + "step": 422 + }, + { + "epoch": 1.0267314702308628, + "grad_norm": 31.9765167465431, + "learning_rate": 0.00010361485598921212, + "loss": 1.1378, + "step": 423 + }, + { + "epoch": 1.0291616038882139, + "grad_norm": 22.340741556027833, + "learning_rate": 0.00010321335227259661, + "loss": 1.1278, + "step": 424 + }, + { + "epoch": 1.031591737545565, + "grad_norm": 29.27286563037163, + "learning_rate": 0.00010281179669461005, + "loss": 1.1186, + "step": 425 + }, + { + "epoch": 1.034021871202916, + "grad_norm": 65.85877610734141, + "learning_rate": 0.00010241019573609269, + "loss": 1.1673, + "step": 426 + }, + { + "epoch": 1.0364520048602672, + "grad_norm": 35.173784527846884, + "learning_rate": 0.00010200855587861724, + "loss": 1.0903, + "step": 427 + }, + { + "epoch": 1.0388821385176186, + "grad_norm": 29.91546238299385, + "learning_rate": 0.00010160688360438419, + "loss": 1.0884, + "step": 428 + }, + { + "epoch": 1.0413122721749697, + "grad_norm": 26.873308685100223, + "learning_rate": 0.0001012051853961172, + 
"loss": 1.1296, + "step": 429 + }, + { + "epoch": 1.0437424058323208, + "grad_norm": 25.90622275527891, + "learning_rate": 0.00010080346773695853, + "loss": 1.1349, + "step": 430 + }, + { + "epoch": 1.046172539489672, + "grad_norm": 21.388851321680434, + "learning_rate": 0.00010040173711036431, + "loss": 1.0947, + "step": 431 + }, + { + "epoch": 1.048602673147023, + "grad_norm": 31.206506843880053, + "learning_rate": 0.0001, + "loss": 1.1541, + "step": 432 + }, + { + "epoch": 1.0510328068043742, + "grad_norm": 19.486767323523555, + "learning_rate": 9.959826288963571e-05, + "loss": 1.1574, + "step": 433 + }, + { + "epoch": 1.0534629404617255, + "grad_norm": 102.81325604770561, + "learning_rate": 9.919653226304148e-05, + "loss": 1.1762, + "step": 434 + }, + { + "epoch": 1.0558930741190766, + "grad_norm": 17.18170280255333, + "learning_rate": 9.879481460388282e-05, + "loss": 1.1208, + "step": 435 + }, + { + "epoch": 1.0583232077764277, + "grad_norm": 29.88292309614927, + "learning_rate": 9.839311639561583e-05, + "loss": 1.1114, + "step": 436 + }, + { + "epoch": 1.0607533414337789, + "grad_norm": 23.50392429976475, + "learning_rate": 9.799144412138275e-05, + "loss": 1.2026, + "step": 437 + }, + { + "epoch": 1.06318347509113, + "grad_norm": 24.794408487434744, + "learning_rate": 9.758980426390732e-05, + "loss": 1.1587, + "step": 438 + }, + { + "epoch": 1.065613608748481, + "grad_norm": 38.726295800289655, + "learning_rate": 9.718820330538998e-05, + "loss": 1.14, + "step": 439 + }, + { + "epoch": 1.0680437424058322, + "grad_norm": 31.152256057732977, + "learning_rate": 9.678664772740343e-05, + "loss": 1.0882, + "step": 440 + }, + { + "epoch": 1.0704738760631836, + "grad_norm": 65.73380095432839, + "learning_rate": 9.638514401078788e-05, + "loss": 1.1213, + "step": 441 + }, + { + "epoch": 1.0729040097205347, + "grad_norm": 69.07317297910537, + "learning_rate": 9.598369863554673e-05, + "loss": 1.1285, + "step": 442 + }, + { + "epoch": 1.0753341433778858, + "grad_norm": 
62.55969576940585, + "learning_rate": 9.558231808074156e-05, + "loss": 1.1252, + "step": 443 + }, + { + "epoch": 1.077764277035237, + "grad_norm": 26.35106444530265, + "learning_rate": 9.51810088243879e-05, + "loss": 1.108, + "step": 444 + }, + { + "epoch": 1.080194410692588, + "grad_norm": 76.70006955440516, + "learning_rate": 9.477977734335061e-05, + "loss": 1.1144, + "step": 445 + }, + { + "epoch": 1.0826245443499392, + "grad_norm": 22.376983523395264, + "learning_rate": 9.437863011323922e-05, + "loss": 1.173, + "step": 446 + }, + { + "epoch": 1.0850546780072905, + "grad_norm": 33.51322062360491, + "learning_rate": 9.397757360830353e-05, + "loss": 1.089, + "step": 447 + }, + { + "epoch": 1.0874848116646416, + "grad_norm": 24.87252097324779, + "learning_rate": 9.357661430132915e-05, + "loss": 1.098, + "step": 448 + }, + { + "epoch": 1.0899149453219927, + "grad_norm": 48.95371674408058, + "learning_rate": 9.317575866353292e-05, + "loss": 1.0491, + "step": 449 + }, + { + "epoch": 1.0923450789793439, + "grad_norm": 25.50740340531524, + "learning_rate": 9.277501316445854e-05, + "loss": 1.0939, + "step": 450 + }, + { + "epoch": 1.094775212636695, + "grad_norm": 27.60998778610316, + "learning_rate": 9.23743842718721e-05, + "loss": 1.1564, + "step": 451 + }, + { + "epoch": 1.097205346294046, + "grad_norm": 63.99226186124907, + "learning_rate": 9.197387845165793e-05, + "loss": 1.1088, + "step": 452 + }, + { + "epoch": 1.0996354799513974, + "grad_norm": 36.441157466567596, + "learning_rate": 9.157350216771378e-05, + "loss": 1.0897, + "step": 453 + }, + { + "epoch": 1.1020656136087486, + "grad_norm": 32.32587774153429, + "learning_rate": 9.117326188184695e-05, + "loss": 1.1285, + "step": 454 + }, + { + "epoch": 1.1044957472660997, + "grad_norm": 33.39257750037465, + "learning_rate": 9.077316405366981e-05, + "loss": 1.1568, + "step": 455 + }, + { + "epoch": 1.1069258809234508, + "grad_norm": 45.03485873480868, + "learning_rate": 9.037321514049548e-05, + "loss": 1.0791, + 
"step": 456 + }, + { + "epoch": 1.109356014580802, + "grad_norm": 35.1451377482015, + "learning_rate": 8.997342159723371e-05, + "loss": 1.1243, + "step": 457 + }, + { + "epoch": 1.111786148238153, + "grad_norm": 67.01465976966, + "learning_rate": 8.957378987628682e-05, + "loss": 1.0978, + "step": 458 + }, + { + "epoch": 1.1142162818955041, + "grad_norm": 33.057859846207634, + "learning_rate": 8.917432642744518e-05, + "loss": 1.1431, + "step": 459 + }, + { + "epoch": 1.1166464155528555, + "grad_norm": 30.602840863536635, + "learning_rate": 8.877503769778356e-05, + "loss": 1.1157, + "step": 460 + }, + { + "epoch": 1.1190765492102066, + "grad_norm": 38.088467248288964, + "learning_rate": 8.83759301315568e-05, + "loss": 1.0776, + "step": 461 + }, + { + "epoch": 1.1215066828675577, + "grad_norm": 66.03671829863266, + "learning_rate": 8.797701017009591e-05, + "loss": 1.1468, + "step": 462 + }, + { + "epoch": 1.1239368165249088, + "grad_norm": 32.293691874682686, + "learning_rate": 8.757828425170404e-05, + "loss": 1.1115, + "step": 463 + }, + { + "epoch": 1.12636695018226, + "grad_norm": 32.70707175332633, + "learning_rate": 8.717975881155261e-05, + "loss": 1.1677, + "step": 464 + }, + { + "epoch": 1.128797083839611, + "grad_norm": 48.79069594971439, + "learning_rate": 8.678144028157759e-05, + "loss": 1.1341, + "step": 465 + }, + { + "epoch": 1.1312272174969624, + "grad_norm": 37.52808559072613, + "learning_rate": 8.638333509037536e-05, + "loss": 1.1414, + "step": 466 + }, + { + "epoch": 1.1336573511543135, + "grad_norm": 27.096068124970536, + "learning_rate": 8.598544966309925e-05, + "loss": 1.1719, + "step": 467 + }, + { + "epoch": 1.1360874848116647, + "grad_norm": 16.019227077248434, + "learning_rate": 8.55877904213558e-05, + "loss": 1.1148, + "step": 468 + }, + { + "epoch": 1.1385176184690158, + "grad_norm": 29.861941956913498, + "learning_rate": 8.519036378310096e-05, + "loss": 1.1486, + "step": 469 + }, + { + "epoch": 1.140947752126367, + "grad_norm": 
23.058998452019107, + "learning_rate": 8.47931761625367e-05, + "loss": 1.0745, + "step": 470 + }, + { + "epoch": 1.143377885783718, + "grad_norm": 24.486692418227875, + "learning_rate": 8.43962339700073e-05, + "loss": 1.1333, + "step": 471 + }, + { + "epoch": 1.1458080194410694, + "grad_norm": 31.632544516924323, + "learning_rate": 8.399954361189615e-05, + "loss": 1.1565, + "step": 472 + }, + { + "epoch": 1.1482381530984205, + "grad_norm": 21.67735267443374, + "learning_rate": 8.360311149052205e-05, + "loss": 1.109, + "step": 473 + }, + { + "epoch": 1.1506682867557716, + "grad_norm": 29.096918560226527, + "learning_rate": 8.320694400403606e-05, + "loss": 1.1517, + "step": 474 + }, + { + "epoch": 1.1530984204131227, + "grad_norm": 46.067313216206955, + "learning_rate": 8.281104754631835e-05, + "loss": 1.1043, + "step": 475 + }, + { + "epoch": 1.1555285540704738, + "grad_norm": 30.84953769166141, + "learning_rate": 8.241542850687465e-05, + "loss": 1.1081, + "step": 476 + }, + { + "epoch": 1.157958687727825, + "grad_norm": 39.34158523904847, + "learning_rate": 8.20200932707335e-05, + "loss": 1.1787, + "step": 477 + }, + { + "epoch": 1.160388821385176, + "grad_norm": 39.14663302484904, + "learning_rate": 8.162504821834295e-05, + "loss": 1.202, + "step": 478 + }, + { + "epoch": 1.1628189550425274, + "grad_norm": 49.7279004249915, + "learning_rate": 8.123029972546781e-05, + "loss": 1.1439, + "step": 479 + }, + { + "epoch": 1.1652490886998785, + "grad_norm": 35.49897960878779, + "learning_rate": 8.083585416308642e-05, + "loss": 1.0741, + "step": 480 + }, + { + "epoch": 1.1676792223572297, + "grad_norm": 31.306252618855535, + "learning_rate": 8.044171789728816e-05, + "loss": 1.0697, + "step": 481 + }, + { + "epoch": 1.1701093560145808, + "grad_norm": 22.40745672651249, + "learning_rate": 8.004789728917059e-05, + "loss": 1.1498, + "step": 482 + }, + { + "epoch": 1.172539489671932, + "grad_norm": 32.19326746671122, + "learning_rate": 7.965439869473664e-05, + "loss": 1.1392, 
+ "step": 483 + }, + { + "epoch": 1.1749696233292832, + "grad_norm": 33.66876390791385, + "learning_rate": 7.926122846479224e-05, + "loss": 1.1049, + "step": 484 + }, + { + "epoch": 1.1773997569866343, + "grad_norm": 35.43357233261174, + "learning_rate": 7.886839294484377e-05, + "loss": 1.0467, + "step": 485 + }, + { + "epoch": 1.1798298906439855, + "grad_norm": 50.660998166256256, + "learning_rate": 7.84758984749955e-05, + "loss": 1.1244, + "step": 486 + }, + { + "epoch": 1.1822600243013366, + "grad_norm": 41.356845334605936, + "learning_rate": 7.808375138984745e-05, + "loss": 1.1279, + "step": 487 + }, + { + "epoch": 1.1846901579586877, + "grad_norm": 22.947663723281487, + "learning_rate": 7.769195801839313e-05, + "loss": 1.0787, + "step": 488 + }, + { + "epoch": 1.1871202916160388, + "grad_norm": 36.434647074399905, + "learning_rate": 7.730052468391725e-05, + "loss": 1.1148, + "step": 489 + }, + { + "epoch": 1.18955042527339, + "grad_norm": 75.94549877059467, + "learning_rate": 7.690945770389377e-05, + "loss": 1.1127, + "step": 490 + }, + { + "epoch": 1.1919805589307413, + "grad_norm": 68.03126664734435, + "learning_rate": 7.6518763389884e-05, + "loss": 1.1672, + "step": 491 + }, + { + "epoch": 1.1944106925880924, + "grad_norm": 40.15361719091623, + "learning_rate": 7.612844804743466e-05, + "loss": 1.0962, + "step": 492 + }, + { + "epoch": 1.1968408262454435, + "grad_norm": 105.80023571763755, + "learning_rate": 7.573851797597602e-05, + "loss": 1.1091, + "step": 493 + }, + { + "epoch": 1.1992709599027946, + "grad_norm": 41.84401502420881, + "learning_rate": 7.534897946872042e-05, + "loss": 1.1359, + "step": 494 + }, + { + "epoch": 1.2017010935601458, + "grad_norm": 21.985533615468846, + "learning_rate": 7.495983881256067e-05, + "loss": 1.1024, + "step": 495 + }, + { + "epoch": 1.2041312272174969, + "grad_norm": 23.02649898605792, + "learning_rate": 7.457110228796838e-05, + "loss": 1.1089, + "step": 496 + }, + { + "epoch": 1.206561360874848, + "grad_norm": 
74.4950498938832, + "learning_rate": 7.418277616889282e-05, + "loss": 1.0439, + "step": 497 + }, + { + "epoch": 1.2089914945321993, + "grad_norm": 27.637660484960865, + "learning_rate": 7.379486672265964e-05, + "loss": 1.1453, + "step": 498 + }, + { + "epoch": 1.2114216281895505, + "grad_norm": 34.98561655821008, + "learning_rate": 7.340738020986961e-05, + "loss": 1.139, + "step": 499 + }, + { + "epoch": 1.2138517618469016, + "grad_norm": 28.47627677351389, + "learning_rate": 7.302032288429756e-05, + "loss": 1.0623, + "step": 500 + }, + { + "epoch": 1.2162818955042527, + "grad_norm": 39.551486186427596, + "learning_rate": 7.263370099279172e-05, + "loss": 1.1277, + "step": 501 + }, + { + "epoch": 1.2187120291616038, + "grad_norm": 44.12973085459368, + "learning_rate": 7.224752077517253e-05, + "loss": 1.1768, + "step": 502 + }, + { + "epoch": 1.2211421628189552, + "grad_norm": 84.84836585196132, + "learning_rate": 7.186178846413214e-05, + "loss": 1.1892, + "step": 503 + }, + { + "epoch": 1.2235722964763063, + "grad_norm": 34.94807915131505, + "learning_rate": 7.147651028513383e-05, + "loss": 1.1108, + "step": 504 + }, + { + "epoch": 1.2260024301336574, + "grad_norm": 46.19847384406232, + "learning_rate": 7.109169245631149e-05, + "loss": 1.0956, + "step": 505 + }, + { + "epoch": 1.2284325637910085, + "grad_norm": 38.58484473058957, + "learning_rate": 7.070734118836925e-05, + "loss": 1.1175, + "step": 506 + }, + { + "epoch": 1.2308626974483596, + "grad_norm": 37.84739298111386, + "learning_rate": 7.032346268448118e-05, + "loss": 1.1411, + "step": 507 + }, + { + "epoch": 1.2332928311057108, + "grad_norm": 53.5471335398439, + "learning_rate": 6.994006314019141e-05, + "loss": 1.1332, + "step": 508 + }, + { + "epoch": 1.2357229647630619, + "grad_norm": 91.55067777365485, + "learning_rate": 6.955714874331387e-05, + "loss": 1.1205, + "step": 509 + }, + { + "epoch": 1.2381530984204132, + "grad_norm": 27.05333642785952, + "learning_rate": 6.917472567383252e-05, + "loss": 
1.099, + "step": 510 + }, + { + "epoch": 1.2405832320777643, + "grad_norm": 24.519879042487336, + "learning_rate": 6.87928001038017e-05, + "loss": 1.1401, + "step": 511 + }, + { + "epoch": 1.2430133657351154, + "grad_norm": 33.763495598365786, + "learning_rate": 6.84113781972464e-05, + "loss": 1.2058, + "step": 512 + }, + { + "epoch": 1.2454434993924666, + "grad_norm": 34.49114206138826, + "learning_rate": 6.803046611006278e-05, + "loss": 1.1044, + "step": 513 + }, + { + "epoch": 1.2478736330498177, + "grad_norm": 74.20211157975073, + "learning_rate": 6.765006998991888e-05, + "loss": 1.111, + "step": 514 + }, + { + "epoch": 1.250303766707169, + "grad_norm": 32.30436806042553, + "learning_rate": 6.727019597615545e-05, + "loss": 1.1063, + "step": 515 + }, + { + "epoch": 1.250303766707169, + "eval_loss": 1.1128273010253906, + "eval_runtime": 53.4998, + "eval_samples_per_second": 13.907, + "eval_steps_per_second": 1.738, + "step": 515 + }, + { + "epoch": 1.25273390036452, + "grad_norm": 42.104054612880084, + "learning_rate": 6.689085019968669e-05, + "loss": 1.1315, + "step": 516 + }, + { + "epoch": 1.2551640340218713, + "grad_norm": 25.66097714624212, + "learning_rate": 6.651203878290139e-05, + "loss": 1.0916, + "step": 517 + }, + { + "epoch": 1.2575941676792224, + "grad_norm": 35.12310576456352, + "learning_rate": 6.613376783956423e-05, + "loss": 1.0699, + "step": 518 + }, + { + "epoch": 1.2600243013365735, + "grad_norm": 34.172951559594566, + "learning_rate": 6.575604347471695e-05, + "loss": 1.1412, + "step": 519 + }, + { + "epoch": 1.2624544349939246, + "grad_norm": 54.373563773275116, + "learning_rate": 6.537887178457984e-05, + "loss": 1.1255, + "step": 520 + }, + { + "epoch": 1.2648845686512757, + "grad_norm": 33.806385046788755, + "learning_rate": 6.500225885645346e-05, + "loss": 1.101, + "step": 521 + }, + { + "epoch": 1.267314702308627, + "grad_norm": 34.17813695957543, + "learning_rate": 6.46262107686203e-05, + "loss": 1.1226, + "step": 522 + }, + { + "epoch": 
1.2697448359659782, + "grad_norm": 24.68048087106548, + "learning_rate": 6.425073359024663e-05, + "loss": 1.1787, + "step": 523 + }, + { + "epoch": 1.2721749696233293, + "grad_norm": 32.78749757697808, + "learning_rate": 6.387583338128471e-05, + "loss": 1.0541, + "step": 524 + }, + { + "epoch": 1.2746051032806804, + "grad_norm": 30.906673844090044, + "learning_rate": 6.350151619237488e-05, + "loss": 1.0964, + "step": 525 + }, + { + "epoch": 1.2770352369380316, + "grad_norm": 32.571858392892736, + "learning_rate": 6.312778806474795e-05, + "loss": 1.1251, + "step": 526 + }, + { + "epoch": 1.2794653705953827, + "grad_norm": 43.02428916532565, + "learning_rate": 6.275465503012751e-05, + "loss": 1.0473, + "step": 527 + }, + { + "epoch": 1.2818955042527338, + "grad_norm": 60.93587506764561, + "learning_rate": 6.2382123110633e-05, + "loss": 1.078, + "step": 528 + }, + { + "epoch": 1.2843256379100851, + "grad_norm": 64.6934775930251, + "learning_rate": 6.201019831868208e-05, + "loss": 1.0904, + "step": 529 + }, + { + "epoch": 1.2867557715674363, + "grad_norm": 32.977077613035426, + "learning_rate": 6.16388866568938e-05, + "loss": 1.0705, + "step": 530 + }, + { + "epoch": 1.2891859052247874, + "grad_norm": 28.27407310492513, + "learning_rate": 6.126819411799175e-05, + "loss": 1.1252, + "step": 531 + }, + { + "epoch": 1.2916160388821385, + "grad_norm": 33.73515826089828, + "learning_rate": 6.0898126684707265e-05, + "loss": 1.1262, + "step": 532 + }, + { + "epoch": 1.2940461725394896, + "grad_norm": 25.370361818959903, + "learning_rate": 6.052869032968285e-05, + "loss": 1.0845, + "step": 533 + }, + { + "epoch": 1.296476306196841, + "grad_norm": 37.389287060597105, + "learning_rate": 6.015989101537586e-05, + "loss": 1.1352, + "step": 534 + }, + { + "epoch": 1.2989064398541919, + "grad_norm": 39.04755104008223, + "learning_rate": 5.979173469396227e-05, + "loss": 1.1538, + "step": 535 + }, + { + "epoch": 1.3013365735115432, + "grad_norm": 34.33676719612293, + "learning_rate": 
5.9424227307240554e-05, + "loss": 1.1725, + "step": 536 + }, + { + "epoch": 1.3037667071688943, + "grad_norm": 64.66076997769457, + "learning_rate": 5.905737478653572e-05, + "loss": 1.1146, + "step": 537 + }, + { + "epoch": 1.3061968408262454, + "grad_norm": 48.043289790386325, + "learning_rate": 5.8691183052603834e-05, + "loss": 1.1035, + "step": 538 + }, + { + "epoch": 1.3086269744835966, + "grad_norm": 49.08397341659928, + "learning_rate": 5.83256580155362e-05, + "loss": 1.1653, + "step": 539 + }, + { + "epoch": 1.3110571081409477, + "grad_norm": 46.688886812303515, + "learning_rate": 5.796080557466406e-05, + "loss": 1.1328, + "step": 540 + }, + { + "epoch": 1.313487241798299, + "grad_norm": 27.503882325413493, + "learning_rate": 5.7596631618463514e-05, + "loss": 1.1019, + "step": 541 + }, + { + "epoch": 1.3159173754556501, + "grad_norm": 48.88974129574653, + "learning_rate": 5.723314202446026e-05, + "loss": 1.121, + "step": 542 + }, + { + "epoch": 1.3183475091130012, + "grad_norm": 28.105881157995345, + "learning_rate": 5.687034265913485e-05, + "loss": 1.0898, + "step": 543 + }, + { + "epoch": 1.3207776427703524, + "grad_norm": 30.410731278414804, + "learning_rate": 5.6508239377828034e-05, + "loss": 1.07, + "step": 544 + }, + { + "epoch": 1.3232077764277035, + "grad_norm": 38.08324176765882, + "learning_rate": 5.614683802464631e-05, + "loss": 1.1503, + "step": 545 + }, + { + "epoch": 1.3256379100850546, + "grad_norm": 46.28952293745534, + "learning_rate": 5.578614443236738e-05, + "loss": 1.1282, + "step": 546 + }, + { + "epoch": 1.3280680437424057, + "grad_norm": 68.2597453597135, + "learning_rate": 5.542616442234618e-05, + "loss": 1.1373, + "step": 547 + }, + { + "epoch": 1.330498177399757, + "grad_norm": 30.351663825014143, + "learning_rate": 5.5066903804421025e-05, + "loss": 1.1633, + "step": 548 + }, + { + "epoch": 1.3329283110571082, + "grad_norm": 38.2711285636887, + "learning_rate": 5.470836837681954e-05, + "loss": 1.1604, + "step": 549 + }, + { + 
"epoch": 1.3353584447144593, + "grad_norm": 35.64230091531108, + "learning_rate": 5.4350563926065404e-05, + "loss": 1.0564, + "step": 550 + }, + { + "epoch": 1.3377885783718104, + "grad_norm": 44.869816046925564, + "learning_rate": 5.399349622688479e-05, + "loss": 1.1376, + "step": 551 + }, + { + "epoch": 1.3402187120291615, + "grad_norm": 26.681037126315633, + "learning_rate": 5.3637171042113146e-05, + "loss": 1.0867, + "step": 552 + }, + { + "epoch": 1.3426488456865129, + "grad_norm": 34.6124686262535, + "learning_rate": 5.32815941226022e-05, + "loss": 1.0474, + "step": 553 + }, + { + "epoch": 1.3450789793438638, + "grad_norm": 35.92639009060983, + "learning_rate": 5.2926771207127254e-05, + "loss": 1.0958, + "step": 554 + }, + { + "epoch": 1.3475091130012151, + "grad_norm": 39.08938922562224, + "learning_rate": 5.2572708022294504e-05, + "loss": 1.074, + "step": 555 + }, + { + "epoch": 1.3499392466585662, + "grad_norm": 76.06708166273745, + "learning_rate": 5.2219410282448514e-05, + "loss": 1.0865, + "step": 556 + }, + { + "epoch": 1.3523693803159174, + "grad_norm": 74.14222265654887, + "learning_rate": 5.1866883689580056e-05, + "loss": 1.1567, + "step": 557 + }, + { + "epoch": 1.3547995139732685, + "grad_norm": 34.82441678662901, + "learning_rate": 5.151513393323426e-05, + "loss": 1.0802, + "step": 558 + }, + { + "epoch": 1.3572296476306196, + "grad_norm": 75.53504846566143, + "learning_rate": 5.116416669041843e-05, + "loss": 1.0623, + "step": 559 + }, + { + "epoch": 1.359659781287971, + "grad_norm": 29.423475817434785, + "learning_rate": 5.0813987625510775e-05, + "loss": 1.077, + "step": 560 + }, + { + "epoch": 1.362089914945322, + "grad_norm": 44.607486168434534, + "learning_rate": 5.046460239016879e-05, + "loss": 1.096, + "step": 561 + }, + { + "epoch": 1.3645200486026732, + "grad_norm": 40.684125033315404, + "learning_rate": 5.011601662323807e-05, + "loss": 1.148, + "step": 562 + }, + { + "epoch": 1.3669501822600243, + "grad_norm": 47.33103026318705, + 
"learning_rate": 4.976823595066128e-05, + "loss": 1.1712, + "step": 563 + }, + { + "epoch": 1.3693803159173754, + "grad_norm": 51.17017845058186, + "learning_rate": 4.9421265985387476e-05, + "loss": 1.1287, + "step": 564 + }, + { + "epoch": 1.3718104495747265, + "grad_norm": 50.76665552103517, + "learning_rate": 4.907511232728145e-05, + "loss": 1.1156, + "step": 565 + }, + { + "epoch": 1.3742405832320777, + "grad_norm": 32.6007633025874, + "learning_rate": 4.872978056303327e-05, + "loss": 1.1477, + "step": 566 + }, + { + "epoch": 1.376670716889429, + "grad_norm": 29.696241441710107, + "learning_rate": 4.8385276266068146e-05, + "loss": 1.0874, + "step": 567 + }, + { + "epoch": 1.37910085054678, + "grad_norm": 58.96613500379004, + "learning_rate": 4.804160499645667e-05, + "loss": 1.0616, + "step": 568 + }, + { + "epoch": 1.3815309842041312, + "grad_norm": 37.104100020310334, + "learning_rate": 4.7698772300824756e-05, + "loss": 1.0878, + "step": 569 + }, + { + "epoch": 1.3839611178614823, + "grad_norm": 51.735902941979305, + "learning_rate": 4.735678371226441e-05, + "loss": 1.0836, + "step": 570 + }, + { + "epoch": 1.3863912515188335, + "grad_norm": 55.49190976804079, + "learning_rate": 4.7015644750244306e-05, + "loss": 1.0473, + "step": 571 + }, + { + "epoch": 1.3888213851761848, + "grad_norm": 34.27972449829039, + "learning_rate": 4.6675360920520625e-05, + "loss": 1.0723, + "step": 572 + }, + { + "epoch": 1.391251518833536, + "grad_norm": 28.508157856527724, + "learning_rate": 4.6335937715048306e-05, + "loss": 1.0723, + "step": 573 + }, + { + "epoch": 1.393681652490887, + "grad_norm": 106.84009565003795, + "learning_rate": 4.599738061189244e-05, + "loss": 1.149, + "step": 574 + }, + { + "epoch": 1.3961117861482382, + "grad_norm": 50.543394606036294, + "learning_rate": 4.565969507513981e-05, + "loss": 1.0991, + "step": 575 + }, + { + "epoch": 1.3985419198055893, + "grad_norm": 30.409124335052745, + "learning_rate": 4.532288655481062e-05, + "loss": 1.1157, + "step": 
576 + }, + { + "epoch": 1.4009720534629404, + "grad_norm": 89.92061876679301, + "learning_rate": 4.498696048677059e-05, + "loss": 1.1526, + "step": 577 + }, + { + "epoch": 1.4034021871202915, + "grad_norm": 84.27775422110602, + "learning_rate": 4.465192229264337e-05, + "loss": 1.1418, + "step": 578 + }, + { + "epoch": 1.4058323207776429, + "grad_norm": 40.7815489623743, + "learning_rate": 4.4317777379722866e-05, + "loss": 1.0831, + "step": 579 + }, + { + "epoch": 1.408262454434994, + "grad_norm": 66.6911504313278, + "learning_rate": 4.3984531140885943e-05, + "loss": 1.1088, + "step": 580 + }, + { + "epoch": 1.410692588092345, + "grad_norm": 137.00882181835217, + "learning_rate": 4.365218895450558e-05, + "loss": 1.1089, + "step": 581 + }, + { + "epoch": 1.4131227217496962, + "grad_norm": 41.139168895296855, + "learning_rate": 4.332075618436386e-05, + "loss": 1.1603, + "step": 582 + }, + { + "epoch": 1.4155528554070473, + "grad_norm": 35.443969765428506, + "learning_rate": 4.29902381795655e-05, + "loss": 1.0301, + "step": 583 + }, + { + "epoch": 1.4179829890643987, + "grad_norm": 32.931514576694674, + "learning_rate": 4.266064027445155e-05, + "loss": 1.1016, + "step": 584 + }, + { + "epoch": 1.4204131227217496, + "grad_norm": 64.21015694858382, + "learning_rate": 4.2331967788513295e-05, + "loss": 1.0789, + "step": 585 + }, + { + "epoch": 1.422843256379101, + "grad_norm": 84.13251752827094, + "learning_rate": 4.200422602630629e-05, + "loss": 1.1573, + "step": 586 + }, + { + "epoch": 1.425273390036452, + "grad_norm": 53.61636603108024, + "learning_rate": 4.167742027736482e-05, + "loss": 1.0942, + "step": 587 + }, + { + "epoch": 1.4277035236938032, + "grad_norm": 133.20877569415256, + "learning_rate": 4.135155581611661e-05, + "loss": 1.0877, + "step": 588 + }, + { + "epoch": 1.4301336573511543, + "grad_norm": 49.85736467319357, + "learning_rate": 4.102663790179764e-05, + "loss": 1.0619, + "step": 589 + }, + { + "epoch": 1.4325637910085054, + "grad_norm": 
91.13217639524017, + "learning_rate": 4.070267177836712e-05, + "loss": 1.1093, + "step": 590 + }, + { + "epoch": 1.4349939246658567, + "grad_norm": 49.25558128250457, + "learning_rate": 4.037966267442315e-05, + "loss": 1.1344, + "step": 591 + }, + { + "epoch": 1.4374240583232079, + "grad_norm": 95.87244356130316, + "learning_rate": 4.005761580311805e-05, + "loss": 1.0929, + "step": 592 + }, + { + "epoch": 1.439854191980559, + "grad_norm": 74.28903671045653, + "learning_rate": 3.973653636207437e-05, + "loss": 1.1263, + "step": 593 + }, + { + "epoch": 1.44228432563791, + "grad_norm": 53.99454529785116, + "learning_rate": 3.941642953330103e-05, + "loss": 1.0916, + "step": 594 + }, + { + "epoch": 1.4447144592952612, + "grad_norm": 113.26015597338959, + "learning_rate": 3.909730048310962e-05, + "loss": 1.1009, + "step": 595 + }, + { + "epoch": 1.4471445929526123, + "grad_norm": 134.4015550981493, + "learning_rate": 3.8779154362030986e-05, + "loss": 1.1351, + "step": 596 + }, + { + "epoch": 1.4495747266099634, + "grad_norm": 90.61611981238187, + "learning_rate": 3.846199630473216e-05, + "loss": 1.0827, + "step": 597 + }, + { + "epoch": 1.4520048602673148, + "grad_norm": 56.55050791518521, + "learning_rate": 3.814583142993352e-05, + "loss": 1.1145, + "step": 598 + }, + { + "epoch": 1.454434993924666, + "grad_norm": 265.6916535243014, + "learning_rate": 3.7830664840326145e-05, + "loss": 1.1459, + "step": 599 + }, + { + "epoch": 1.456865127582017, + "grad_norm": 72.81191101030372, + "learning_rate": 3.7516501622489367e-05, + "loss": 1.0903, + "step": 600 + }, + { + "epoch": 1.4592952612393681, + "grad_norm": 58.309143549086556, + "learning_rate": 3.720334684680889e-05, + "loss": 1.1041, + "step": 601 + }, + { + "epoch": 1.4617253948967193, + "grad_norm": 35.19205741792398, + "learning_rate": 3.689120556739475e-05, + "loss": 1.1523, + "step": 602 + }, + { + "epoch": 1.4641555285540706, + "grad_norm": 88.97226951757321, + "learning_rate": 3.6580082821999786e-05, + "loss": 
1.1117, + "step": 603 + }, + { + "epoch": 1.4665856622114215, + "grad_norm": 64.50873879301322, + "learning_rate": 3.6269983631938475e-05, + "loss": 1.1256, + "step": 604 + }, + { + "epoch": 1.4690157958687728, + "grad_norm": 78.10556611104111, + "learning_rate": 3.596091300200578e-05, + "loss": 1.0834, + "step": 605 + }, + { + "epoch": 1.471445929526124, + "grad_norm": 69.38449946362529, + "learning_rate": 3.565287592039628e-05, + "loss": 1.1026, + "step": 606 + }, + { + "epoch": 1.473876063183475, + "grad_norm": 79.60241521456905, + "learning_rate": 3.534587735862391e-05, + "loss": 1.0456, + "step": 607 + }, + { + "epoch": 1.4763061968408262, + "grad_norm": 89.68581306071424, + "learning_rate": 3.503992227144147e-05, + "loss": 1.0809, + "step": 608 + }, + { + "epoch": 1.4787363304981773, + "grad_norm": 68.570527237558, + "learning_rate": 3.473501559676088e-05, + "loss": 1.0754, + "step": 609 + }, + { + "epoch": 1.4811664641555287, + "grad_norm": 54.94762317625427, + "learning_rate": 3.4431162255573245e-05, + "loss": 1.1751, + "step": 610 + }, + { + "epoch": 1.4835965978128798, + "grad_norm": 109.12821602719706, + "learning_rate": 3.4128367151869714e-05, + "loss": 1.1055, + "step": 611 + }, + { + "epoch": 1.486026731470231, + "grad_norm": 198.79030469542352, + "learning_rate": 3.3826635172562094e-05, + "loss": 1.1369, + "step": 612 + }, + { + "epoch": 1.488456865127582, + "grad_norm": 62.002866716809, + "learning_rate": 3.352597118740404e-05, + "loss": 1.1611, + "step": 613 + }, + { + "epoch": 1.4908869987849331, + "grad_norm": 79.21193137029579, + "learning_rate": 3.3226380048912585e-05, + "loss": 1.1688, + "step": 614 + }, + { + "epoch": 1.4933171324422843, + "grad_norm": 68.6722934326242, + "learning_rate": 3.292786659228973e-05, + "loss": 1.1248, + "step": 615 + }, + { + "epoch": 1.4957472660996354, + "grad_norm": 104.34122241838278, + "learning_rate": 3.263043563534428e-05, + "loss": 1.1425, + "step": 616 + }, + { + "epoch": 1.4981773997569867, + "grad_norm": 
86.43862038340298, + "learning_rate": 3.233409197841437e-05, + "loss": 1.0562, + "step": 617 + }, + { + "epoch": 1.5006075334143378, + "grad_norm": 79.74137751394451, + "learning_rate": 3.2038840404289705e-05, + "loss": 1.1214, + "step": 618 + }, + { + "epoch": 1.5006075334143378, + "eval_loss": 1.1088899374008179, + "eval_runtime": 53.0545, + "eval_samples_per_second": 14.023, + "eval_steps_per_second": 1.753, + "step": 618 + } + ], + "logging_steps": 1, + "max_steps": 822, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.157723878347244e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-618/training_args.bin b/checkpoint-618/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/checkpoint-618/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568 diff --git a/checkpoint-618/zero_to_fp32.py b/checkpoint-618/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-618/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) 
+ + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = 
torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-772/README.md b/checkpoint-772/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-772/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-772/adapter_config.json b/checkpoint-772/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d9d8fa4860138947c736b05e4c3dd010601e2671 --- /dev/null +++ b/checkpoint-772/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "k_proj", + "gate_up_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-772/adapter_model.safetensors b/checkpoint-772/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efd4f6d0842ac4eee5367d6307eb60c53f719da1 --- /dev/null +++ b/checkpoint-772/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a429bd0213ee3f039d3495189dd78fc3c718e5e7b9dc021d50db49606805cb5 +size 5579575888 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e4efed2d9158f5e347589f668179255d748ce3f --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52bf210bfbacef345afbf22fef52f863a0c4dd794b34d8ba6fb4625b51747425 +size 2458601314 
diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdab4a2c4fd26bcc09ebcce62851498b709396ac --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d0ec0fca7fa53b6ebdc629bdb92ce80df032e2245cb222a1d27f4c658c3cce1 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a759ce7a9292456cc182bf48012e5a40d472bb7a --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779b6930d95dca330aa3cf0c8072e4a0dc3fcc8a1879f06578b790ec8e9785f3 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2cf1ddf420cab05d96e179879577e1380c9c507 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9235fbd49d0080521cf226ea88045fd4131d451b2f9f65206e2a491bef10633 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..883bfa0675976c5bfbc2a48a7adc7565d4917e6d --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8608560fe69cbbee152b21c18551b87860579716512b78189d5b6f54fe38381b +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd0f103692fe42c845fa4984297d83e079fde655 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4688795cec4d22423afd1a59ecde6329428f4a7767c23d7b69cac6470de36e84 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9e563b0900546dc3cb5fedb972e14dac98e9f26 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad73a534b1a7b01cb75d306d2485eba0701a079986666dc31b518895a8e31494 +size 2458601314 diff --git a/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00e956b249dc8f7c960fa4edc9b6e5e2ddc2aa68 --- /dev/null +++ b/checkpoint-772/global_step772/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1136fb48987f32a04186db2b48ff9a8e56d87c14dc9fb5910b490500b78a1808 +size 2458601314 diff --git a/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bccd5d3f2e4a5344a8ab671ba9e574655d48069 --- 
/dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b978b747317a3b603149b9fd0f9a2890bbde79d056e71d33616c49732c2a400 +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38fdddfb5369b380374aea97240d35c142317877 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5de312ccaa1b40bf0b2e53f82e4af34f398cd675ae82c42ca1e5328ce1475a +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f56495a0e7653e0375a21dbfb44840faf342d137 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2088a9c158a8e29039f2d25c01ca5831bbd50ae7f05818eee42ea02f3556dc +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..411dcbfd58343a708c500a7b991957fefd2f5baf --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ec4aa947bc922c74815c7db480fe0dbd4f7991f2b141f1252b3593f6ad7ee71 +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..401331e9ba26f657834656a20c84d359573d7a06 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9264b08d23fc9636ea17bcc8238a8cfd1c7064a22711c1557e7ec0708174988c +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dec1589e5d65e770275fb05d448b576f87a19b2 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae179cada4a016a1aa2818065da449ae275da7e5d76cd91a8ab3c9d294de2ae +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2b96f0f56fbd4b86e69f48a7b27687b75e9105e --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67ae6578c5308d2ba390277dc3e8885b9cd57f62550755b8f5648dcd3a6b3be +size 747668 diff --git a/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdbec0ec4a1b8ce9f9b78b0442fc58c6e80601e4 --- /dev/null +++ b/checkpoint-772/global_step772/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eba29263e25cb4d1647a7500cf93bc0158d98d6403828e7794d0bc6c78af264 +size 747668 diff --git a/checkpoint-772/latest b/checkpoint-772/latest new file mode 100644 index 
0000000000000000000000000000000000000000..5bbd92f860917d9be16238187bc2f86bba0c5670 --- /dev/null +++ b/checkpoint-772/latest @@ -0,0 +1 @@ +global_step772 \ No newline at end of file diff --git a/checkpoint-772/rng_state_0.pth b/checkpoint-772/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ee67a9fe52438bf9b329a6ee618dfda99e3f467 --- /dev/null +++ b/checkpoint-772/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18566e96d351f85c9faf808d2b8e8b090ac0eebabafe863d5320bf7cc2562e69 +size 15984 diff --git a/checkpoint-772/rng_state_1.pth b/checkpoint-772/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d4331d73f7582d82d99fa612f1d416646c40ce7 --- /dev/null +++ b/checkpoint-772/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef6e1e354a2761c9dfe8da34c560d5a5ee9fefedac31317c8ff85710de1261b +size 15984 diff --git a/checkpoint-772/rng_state_2.pth b/checkpoint-772/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..16d822119b77a126e5baa615a68181a03d099a7a --- /dev/null +++ b/checkpoint-772/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5df4b4b7b92851c781ba46584013741a933dde8af0b0cb5c1fa07712e79cc8 +size 15984 diff --git a/checkpoint-772/rng_state_3.pth b/checkpoint-772/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c98bd3f46a8b7286cc1d121246b38da950881056 --- /dev/null +++ b/checkpoint-772/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19e8ce609ad14ca28d6ad7eb241877b2d8d1550e78093a062a56bd58615f2cf +size 15984 diff --git a/checkpoint-772/rng_state_4.pth b/checkpoint-772/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4bfbfc6202d81260748e53fb67a9f4a49020dd28 --- /dev/null +++ b/checkpoint-772/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b57c77e995abe0fba5f3846694b27200af5934217086635b6cb04a2c25be8e3e +size 15984 diff --git a/checkpoint-772/rng_state_5.pth b/checkpoint-772/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..343a53021b34d918a5b8bb0dba622462755bc641 --- /dev/null +++ b/checkpoint-772/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e287ad85b1923f8fa5b635d0b38e32a77e1bc312a43abc82def3622ed2a6e5 +size 15984 diff --git a/checkpoint-772/rng_state_6.pth b/checkpoint-772/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2f477a9676518042d0e60f50bbeef3f682b93e4 --- /dev/null +++ b/checkpoint-772/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684f4313476ed839fa25cdc36ba6b47f3152341389952bc02b263da4c5ae8f8e +size 15984 diff --git a/checkpoint-772/rng_state_7.pth b/checkpoint-772/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..ccfbc33a0b3571f51e17967b949631ffcefeb919 --- /dev/null +++ b/checkpoint-772/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d969ebb64ba903d248a0b8df9875e21ddc9fbb3219bf580c656a4fc3043e6c9 +size 15984 diff --git a/checkpoint-772/scheduler.pt b/checkpoint-772/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2c6c76fbd456c67cb8872f27364f51ef0e22313 --- /dev/null +++ b/checkpoint-772/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d419ec6d40fe5adabb5591a31055f0370131c25b4bfb62a6dbe9046d1d19b5 +size 1064 diff --git a/checkpoint-772/special_tokens_map.json b/checkpoint-772/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-772/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-772/tokenizer.json b/checkpoint-772/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-772/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-772/tokenizer_config.json b/checkpoint-772/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-772/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-772/trainer_state.json b/checkpoint-772/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..821e291800d05f5a6fc85764080147c4a6f5aab4 --- /dev/null +++ b/checkpoint-772/trainer_state.json @@ -0,0 +1,5502 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 97, + "global_step": 772, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025906735751295338, + "grad_norm": 758.2562349755826, + "learning_rate": 0.0, + "loss": 1.3719, + "step": 1 + }, + { + "epoch": 0.0025906735751295338, + "eval_loss": 1.3159157037734985, + "eval_runtime": 36.907, + "eval_samples_per_second": 20.159, + "eval_steps_per_second": 1.273, + "step": 1 + }, + { + "epoch": 0.0051813471502590676, + "grad_norm": 666.308184823038, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.36, + "step": 2 + }, + { + "epoch": 0.007772020725388601, + "grad_norm": 211.0771195353068, + "learning_rate": 2.0000000000000003e-06, + 
"loss": 1.3746, + "step": 3 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 431.5114709683218, + "learning_rate": 3e-06, + "loss": 1.3412, + "step": 4 + }, + { + "epoch": 0.012953367875647668, + "grad_norm": 230.87468433791625, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3837, + "step": 5 + }, + { + "epoch": 0.015544041450777202, + "grad_norm": 635.1636587738542, + "learning_rate": 5e-06, + "loss": 1.3761, + "step": 6 + }, + { + "epoch": 0.018134715025906734, + "grad_norm": 791.5536958334704, + "learning_rate": 6e-06, + "loss": 1.2855, + "step": 7 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 667.7197994216477, + "learning_rate": 7e-06, + "loss": 1.3267, + "step": 8 + }, + { + "epoch": 0.023316062176165803, + "grad_norm": 254.3855973692125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2977, + "step": 9 + }, + { + "epoch": 0.025906735751295335, + "grad_norm": 162.29347257682093, + "learning_rate": 9e-06, + "loss": 1.3522, + "step": 10 + }, + { + "epoch": 0.02849740932642487, + "grad_norm": 352.6352930651456, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 11 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 148.2629265526552, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.3342, + "step": 12 + }, + { + "epoch": 0.03367875647668394, + "grad_norm": 249.88753789723657, + "learning_rate": 1.2e-05, + "loss": 1.2983, + "step": 13 + }, + { + "epoch": 0.03626943005181347, + "grad_norm": 184.03358422636597, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.3291, + "step": 14 + }, + { + "epoch": 0.038860103626943004, + "grad_norm": 198.4491469860763, + "learning_rate": 1.4e-05, + "loss": 1.4014, + "step": 15 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 680.9537058769038, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3775, + "step": 16 + }, + { + "epoch": 0.04404145077720207, + "grad_norm": 563.0247638614801, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3228, + "step": 17 + }, + { + 
"epoch": 0.046632124352331605, + "grad_norm": 271.985463813746, + "learning_rate": 1.7e-05, + "loss": 1.3695, + "step": 18 + }, + { + "epoch": 0.04922279792746114, + "grad_norm": 399.51218452223316, + "learning_rate": 1.8e-05, + "loss": 1.2556, + "step": 19 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 160.70697055826656, + "learning_rate": 1.9e-05, + "loss": 1.2982, + "step": 20 + }, + { + "epoch": 0.054404145077720206, + "grad_norm": 227.8927504687491, + "learning_rate": 2e-05, + "loss": 1.3532, + "step": 21 + }, + { + "epoch": 0.05699481865284974, + "grad_norm": 550.1538868076032, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.2603, + "step": 22 + }, + { + "epoch": 0.05958549222797927, + "grad_norm": 291.8994359919024, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.3663, + "step": 23 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 120.60677833129643, + "learning_rate": 2.3e-05, + "loss": 1.3129, + "step": 24 + }, + { + "epoch": 0.06476683937823834, + "grad_norm": 414.4006662101242, + "learning_rate": 2.4e-05, + "loss": 1.3037, + "step": 25 + }, + { + "epoch": 0.06735751295336788, + "grad_norm": 141.48324465317884, + "learning_rate": 2.5e-05, + "loss": 1.3095, + "step": 26 + }, + { + "epoch": 0.06994818652849741, + "grad_norm": 147.86066819937994, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.2372, + "step": 27 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 214.47337614964576, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.3384, + "step": 28 + }, + { + "epoch": 0.07512953367875648, + "grad_norm": 898.4324889241673, + "learning_rate": 2.8e-05, + "loss": 1.2003, + "step": 29 + }, + { + "epoch": 0.07772020725388601, + "grad_norm": 128.83026557596128, + "learning_rate": 2.9e-05, + "loss": 1.2172, + "step": 30 + }, + { + "epoch": 0.08031088082901554, + "grad_norm": 183.0777862405529, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.2674, + "step": 31 + }, + { + "epoch": 0.08290155440414508, + 
"grad_norm": 119.01841833358732, + "learning_rate": 3.1e-05, + "loss": 1.2554, + "step": 32 + }, + { + "epoch": 0.08549222797927461, + "grad_norm": 117.65980267542858, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.2716, + "step": 33 + }, + { + "epoch": 0.08808290155440414, + "grad_norm": 82.40151099433953, + "learning_rate": 3.3e-05, + "loss": 1.2019, + "step": 34 + }, + { + "epoch": 0.09067357512953368, + "grad_norm": 82.61816783653785, + "learning_rate": 3.4e-05, + "loss": 1.2424, + "step": 35 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 136.42743433868276, + "learning_rate": 3.5000000000000004e-05, + "loss": 1.2066, + "step": 36 + }, + { + "epoch": 0.09585492227979274, + "grad_norm": 36.775911657584444, + "learning_rate": 3.6e-05, + "loss": 1.2485, + "step": 37 + }, + { + "epoch": 0.09844559585492228, + "grad_norm": 56.55022603284064, + "learning_rate": 3.7000000000000005e-05, + "loss": 1.2112, + "step": 38 + }, + { + "epoch": 0.10103626943005181, + "grad_norm": 50.09896932886107, + "learning_rate": 3.8e-05, + "loss": 1.2027, + "step": 39 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 54.2661481198025, + "learning_rate": 3.9e-05, + "loss": 1.2673, + "step": 40 + }, + { + "epoch": 0.10621761658031088, + "grad_norm": 60.04145981731815, + "learning_rate": 4e-05, + "loss": 1.1648, + "step": 41 + }, + { + "epoch": 0.10880829015544041, + "grad_norm": 169.47741055545822, + "learning_rate": 3.999981580539036e-05, + "loss": 1.2393, + "step": 42 + }, + { + "epoch": 0.11139896373056994, + "grad_norm": 43.64716987307323, + "learning_rate": 3.9999263224954204e-05, + "loss": 1.2906, + "step": 43 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 51.3206609767585, + "learning_rate": 3.999834226886976e-05, + "loss": 1.1807, + "step": 44 + }, + { + "epoch": 0.11658031088082901, + "grad_norm": 38.95055887413869, + "learning_rate": 3.999705295410054e-05, + "loss": 1.1825, + "step": 45 + }, + { + "epoch": 0.11917098445595854, + "grad_norm": 
40.59968974426338, + "learning_rate": 3.999539530439504e-05, + "loss": 1.193, + "step": 46 + }, + { + "epoch": 0.12176165803108809, + "grad_norm": 34.5796571445333, + "learning_rate": 3.9993369350286265e-05, + "loss": 1.2127, + "step": 47 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 37.97693356149241, + "learning_rate": 3.99909751290912e-05, + "loss": 1.1543, + "step": 48 + }, + { + "epoch": 0.12694300518134716, + "grad_norm": 82.9217015858092, + "learning_rate": 3.9988212684910107e-05, + "loss": 1.2329, + "step": 49 + }, + { + "epoch": 0.12953367875647667, + "grad_norm": 49.256542144400214, + "learning_rate": 3.9985082068625724e-05, + "loss": 1.212, + "step": 50 + }, + { + "epoch": 0.13212435233160622, + "grad_norm": 45.025980435259484, + "learning_rate": 3.998158333790231e-05, + "loss": 1.2129, + "step": 51 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 45.98465689592428, + "learning_rate": 3.99777165571846e-05, + "loss": 1.1709, + "step": 52 + }, + { + "epoch": 0.13730569948186527, + "grad_norm": 43.481241408477906, + "learning_rate": 3.997348179769661e-05, + "loss": 1.1614, + "step": 53 + }, + { + "epoch": 0.13989637305699482, + "grad_norm": 82.17633750834132, + "learning_rate": 3.996887913744033e-05, + "loss": 1.2205, + "step": 54 + }, + { + "epoch": 0.14248704663212436, + "grad_norm": 53.0176514970764, + "learning_rate": 3.9963908661194285e-05, + "loss": 1.1204, + "step": 55 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 67.86382426995611, + "learning_rate": 3.995857046051196e-05, + "loss": 1.1839, + "step": 56 + }, + { + "epoch": 0.14766839378238342, + "grad_norm": 31.282407703790597, + "learning_rate": 3.995286463372013e-05, + "loss": 1.2126, + "step": 57 + }, + { + "epoch": 0.15025906735751296, + "grad_norm": 52.200764429265604, + "learning_rate": 3.994679128591706e-05, + "loss": 1.2036, + "step": 58 + }, + { + "epoch": 0.15284974093264247, + "grad_norm": 60.706608653531895, + "learning_rate": 3.9940350528970535e-05, + "loss": 
1.1848, + "step": 59 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 47.31754062899529, + "learning_rate": 3.993354248151583e-05, + "loss": 1.0869, + "step": 60 + }, + { + "epoch": 0.15803108808290156, + "grad_norm": 49.42450836392811, + "learning_rate": 3.9926367268953514e-05, + "loss": 1.2651, + "step": 61 + }, + { + "epoch": 0.16062176165803108, + "grad_norm": 38.791167030088886, + "learning_rate": 3.991882502344712e-05, + "loss": 1.1881, + "step": 62 + }, + { + "epoch": 0.16321243523316062, + "grad_norm": 56.16339499737216, + "learning_rate": 3.991091588392077e-05, + "loss": 1.1518, + "step": 63 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 861.8559063020828, + "learning_rate": 3.990263999605652e-05, + "loss": 1.1614, + "step": 64 + }, + { + "epoch": 0.16839378238341968, + "grad_norm": 50.92822786500888, + "learning_rate": 3.989399751229179e-05, + "loss": 1.1998, + "step": 65 + }, + { + "epoch": 0.17098445595854922, + "grad_norm": 31.04121324055666, + "learning_rate": 3.988498859181645e-05, + "loss": 1.1795, + "step": 66 + }, + { + "epoch": 0.17357512953367876, + "grad_norm": 50.33061983380845, + "learning_rate": 3.9875613400569975e-05, + "loss": 1.1742, + "step": 67 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 75.20462514003519, + "learning_rate": 3.986587211123833e-05, + "loss": 1.1856, + "step": 68 + }, + { + "epoch": 0.17875647668393782, + "grad_norm": 38.82139317052205, + "learning_rate": 3.98557649032508e-05, + "loss": 1.1529, + "step": 69 + }, + { + "epoch": 0.18134715025906736, + "grad_norm": 36.55988806615175, + "learning_rate": 3.984529196277674e-05, + "loss": 1.1884, + "step": 70 + }, + { + "epoch": 0.18393782383419688, + "grad_norm": 104.8931793971097, + "learning_rate": 3.983445348272203e-05, + "loss": 1.2182, + "step": 71 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 36.50395409234617, + "learning_rate": 3.982324966272566e-05, + "loss": 1.1609, + "step": 72 + }, + { + "epoch": 0.18911917098445596, + 
"grad_norm": 35.019191693448626, + "learning_rate": 3.981168070915594e-05, + "loss": 1.173, + "step": 73 + }, + { + "epoch": 0.19170984455958548, + "grad_norm": 33.378390048053596, + "learning_rate": 3.979974683510677e-05, + "loss": 1.173, + "step": 74 + }, + { + "epoch": 0.19430051813471502, + "grad_norm": 43.356840136984154, + "learning_rate": 3.978744826039366e-05, + "loss": 1.2032, + "step": 75 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 31.285725922510768, + "learning_rate": 3.977478521154974e-05, + "loss": 1.1569, + "step": 76 + }, + { + "epoch": 0.19948186528497408, + "grad_norm": 35.19264482867074, + "learning_rate": 3.9761757921821544e-05, + "loss": 1.1365, + "step": 77 + }, + { + "epoch": 0.20207253886010362, + "grad_norm": 44.66037256551279, + "learning_rate": 3.974836663116472e-05, + "loss": 1.164, + "step": 78 + }, + { + "epoch": 0.20466321243523317, + "grad_norm": 68.91101457952654, + "learning_rate": 3.973461158623963e-05, + "loss": 1.2256, + "step": 79 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 45.866521854583, + "learning_rate": 3.9720493040406786e-05, + "loss": 1.1697, + "step": 80 + }, + { + "epoch": 0.20984455958549222, + "grad_norm": 59.63095169617338, + "learning_rate": 3.970601125372218e-05, + "loss": 1.2094, + "step": 81 + }, + { + "epoch": 0.21243523316062177, + "grad_norm": 39.085597271064216, + "learning_rate": 3.9691166492932535e-05, + "loss": 1.1048, + "step": 82 + }, + { + "epoch": 0.21502590673575128, + "grad_norm": 36.40256073477861, + "learning_rate": 3.9675959031470336e-05, + "loss": 1.248, + "step": 83 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 29.846921716586085, + "learning_rate": 3.966038914944881e-05, + "loss": 1.1718, + "step": 84 + }, + { + "epoch": 0.22020725388601037, + "grad_norm": 50.87052190327881, + "learning_rate": 3.964445713365682e-05, + "loss": 1.1529, + "step": 85 + }, + { + "epoch": 0.22279792746113988, + "grad_norm": 35.32915760431302, + "learning_rate": 
3.9628163277553486e-05, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.22538860103626943, + "grad_norm": 157.5587514654703, + "learning_rate": 3.961150788126286e-05, + "loss": 1.2194, + "step": 87 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 25.03485489120971, + "learning_rate": 3.9594491251568376e-05, + "loss": 1.1392, + "step": 88 + }, + { + "epoch": 0.23056994818652848, + "grad_norm": 80.55933867045263, + "learning_rate": 3.957711370190716e-05, + "loss": 1.1819, + "step": 89 + }, + { + "epoch": 0.23316062176165803, + "grad_norm": 272.22874004071406, + "learning_rate": 3.9559375552364325e-05, + "loss": 1.0998, + "step": 90 + }, + { + "epoch": 0.23575129533678757, + "grad_norm": 91.94671663482514, + "learning_rate": 3.954127712966702e-05, + "loss": 1.2494, + "step": 91 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 54.31533598131098, + "learning_rate": 3.952281876717843e-05, + "loss": 1.1385, + "step": 92 + }, + { + "epoch": 0.24093264248704663, + "grad_norm": 103.20789745908105, + "learning_rate": 3.950400080489165e-05, + "loss": 1.1398, + "step": 93 + }, + { + "epoch": 0.24352331606217617, + "grad_norm": 45.14746362545893, + "learning_rate": 3.94848235894234e-05, + "loss": 1.2697, + "step": 94 + }, + { + "epoch": 0.24611398963730569, + "grad_norm": 21.271923336142002, + "learning_rate": 3.9465287474007654e-05, + "loss": 1.1397, + "step": 95 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 93.89786795431422, + "learning_rate": 3.944539281848912e-05, + "loss": 1.1542, + "step": 96 + }, + { + "epoch": 0.25129533678756477, + "grad_norm": 32.38768349342839, + "learning_rate": 3.942513998931663e-05, + "loss": 1.1693, + "step": 97 + }, + { + "epoch": 0.25129533678756477, + "eval_loss": 1.1344976425170898, + "eval_runtime": 37.8807, + "eval_samples_per_second": 19.641, + "eval_steps_per_second": 1.241, + "step": 97 + }, + { + "epoch": 0.2538860103626943, + "grad_norm": 91.41293468177638, + "learning_rate": 3.940452935953639e-05, + 
"loss": 1.1724, + "step": 98 + }, + { + "epoch": 0.25647668393782386, + "grad_norm": 39.20645478419229, + "learning_rate": 3.9383561308785075e-05, + "loss": 1.1583, + "step": 99 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 35.32804513153546, + "learning_rate": 3.9362236223282885e-05, + "loss": 1.158, + "step": 100 + }, + { + "epoch": 0.2616580310880829, + "grad_norm": 35.24783762804842, + "learning_rate": 3.934055449582641e-05, + "loss": 1.1552, + "step": 101 + }, + { + "epoch": 0.26424870466321243, + "grad_norm": 33.743808031979775, + "learning_rate": 3.931851652578137e-05, + "loss": 1.264, + "step": 102 + }, + { + "epoch": 0.266839378238342, + "grad_norm": 113.49798793226394, + "learning_rate": 3.92961227190753e-05, + "loss": 1.2361, + "step": 103 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 31.813807349410364, + "learning_rate": 3.9273373488190036e-05, + "loss": 1.1246, + "step": 104 + }, + { + "epoch": 0.27202072538860106, + "grad_norm": 29.391695486306187, + "learning_rate": 3.925026925215417e-05, + "loss": 1.1142, + "step": 105 + }, + { + "epoch": 0.27461139896373055, + "grad_norm": 33.79933331839905, + "learning_rate": 3.922681043653526e-05, + "loss": 1.1401, + "step": 106 + }, + { + "epoch": 0.2772020725388601, + "grad_norm": 39.09509012730907, + "learning_rate": 3.920299747343204e-05, + "loss": 1.1822, + "step": 107 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 37.81471938433609, + "learning_rate": 3.9178830801466465e-05, + "loss": 1.1592, + "step": 108 + }, + { + "epoch": 0.2823834196891192, + "grad_norm": 69.07753778460207, + "learning_rate": 3.915431086577561e-05, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.2849740932642487, + "grad_norm": 28.864787246081605, + "learning_rate": 3.912943811800347e-05, + "loss": 1.1179, + "step": 110 + }, + { + "epoch": 0.28756476683937826, + "grad_norm": 28.842042951717836, + "learning_rate": 3.910421301629264e-05, + "loss": 1.1317, + "step": 111 + }, + { + "epoch": 
0.29015544041450775, + "grad_norm": 51.475482074695506, + "learning_rate": 3.9078636025275904e-05, + "loss": 1.1451, + "step": 112 + }, + { + "epoch": 0.2927461139896373, + "grad_norm": 33.48279556713943, + "learning_rate": 3.9052707616067654e-05, + "loss": 1.1554, + "step": 113 + }, + { + "epoch": 0.29533678756476683, + "grad_norm": 21.279603575929844, + "learning_rate": 3.9026428266255205e-05, + "loss": 1.1636, + "step": 114 + }, + { + "epoch": 0.2979274611398964, + "grad_norm": 36.226178034876675, + "learning_rate": 3.899979845989003e-05, + "loss": 1.1966, + "step": 115 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 29.90506353145981, + "learning_rate": 3.897281868747878e-05, + "loss": 1.1888, + "step": 116 + }, + { + "epoch": 0.30310880829015546, + "grad_norm": 36.04602777809767, + "learning_rate": 3.894548944597434e-05, + "loss": 1.2066, + "step": 117 + }, + { + "epoch": 0.30569948186528495, + "grad_norm": 36.42793844948301, + "learning_rate": 3.8917811238766606e-05, + "loss": 1.1712, + "step": 118 + }, + { + "epoch": 0.3082901554404145, + "grad_norm": 58.788967662325696, + "learning_rate": 3.888978457567323e-05, + "loss": 1.1225, + "step": 119 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 29.357299816022326, + "learning_rate": 3.886140997293024e-05, + "loss": 1.1315, + "step": 120 + }, + { + "epoch": 0.3134715025906736, + "grad_norm": 95.08345317107502, + "learning_rate": 3.883268795318252e-05, + "loss": 1.1852, + "step": 121 + }, + { + "epoch": 0.3160621761658031, + "grad_norm": 33.6623824593179, + "learning_rate": 3.88036190454742e-05, + "loss": 1.16, + "step": 122 + }, + { + "epoch": 0.31865284974093266, + "grad_norm": 42.587546987131105, + "learning_rate": 3.8774203785238886e-05, + "loss": 1.1374, + "step": 123 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 33.360649853064245, + "learning_rate": 3.8744442714289816e-05, + "loss": 1.1757, + "step": 124 + }, + { + "epoch": 0.3238341968911917, + "grad_norm": 49.09256643961471, + 
"learning_rate": 3.8714336380809874e-05, + "loss": 1.1782, + "step": 125 + }, + { + "epoch": 0.32642487046632124, + "grad_norm": 31.505007051172793, + "learning_rate": 3.86838853393415e-05, + "loss": 1.195, + "step": 126 + }, + { + "epoch": 0.3290155440414508, + "grad_norm": 34.36735417254799, + "learning_rate": 3.865309015077645e-05, + "loss": 1.1078, + "step": 127 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 36.63220606142181, + "learning_rate": 3.862195138234551e-05, + "loss": 1.1319, + "step": 128 + }, + { + "epoch": 0.33419689119170987, + "grad_norm": 53.324986862513676, + "learning_rate": 3.859046960760801e-05, + "loss": 1.2301, + "step": 129 + }, + { + "epoch": 0.33678756476683935, + "grad_norm": 47.41445409144979, + "learning_rate": 3.855864540644126e-05, + "loss": 1.2366, + "step": 130 + }, + { + "epoch": 0.3393782383419689, + "grad_norm": 32.57355122427366, + "learning_rate": 3.8526479365029906e-05, + "loss": 1.142, + "step": 131 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 28.445824333644715, + "learning_rate": 3.849397207585508e-05, + "loss": 1.0847, + "step": 132 + }, + { + "epoch": 0.344559585492228, + "grad_norm": 49.23062726715889, + "learning_rate": 3.846112413768353e-05, + "loss": 1.2241, + "step": 133 + }, + { + "epoch": 0.3471502590673575, + "grad_norm": 53.424206543788074, + "learning_rate": 3.842793615555657e-05, + "loss": 1.2392, + "step": 134 + }, + { + "epoch": 0.34974093264248707, + "grad_norm": 38.19316140175426, + "learning_rate": 3.8394408740778934e-05, + "loss": 1.1208, + "step": 135 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 32.35931252369273, + "learning_rate": 3.836054251090755e-05, + "loss": 1.1604, + "step": 136 + }, + { + "epoch": 0.3549222797927461, + "grad_norm": 37.90085344799495, + "learning_rate": 3.83263380897401e-05, + "loss": 1.1134, + "step": 137 + }, + { + "epoch": 0.35751295336787564, + "grad_norm": 44.49191588319939, + "learning_rate": 3.829179610730359e-05, + "loss": 1.1281, + 
"step": 138 + }, + { + "epoch": 0.3601036269430052, + "grad_norm": 141.98524430756757, + "learning_rate": 3.8256917199842715e-05, + "loss": 1.0928, + "step": 139 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 30.887093976524472, + "learning_rate": 3.822170200980815e-05, + "loss": 1.0936, + "step": 140 + }, + { + "epoch": 0.36528497409326427, + "grad_norm": 21.980521878837745, + "learning_rate": 3.818615118584472e-05, + "loss": 1.1368, + "step": 141 + }, + { + "epoch": 0.36787564766839376, + "grad_norm": 538.6650762618656, + "learning_rate": 3.815026538277943e-05, + "loss": 1.0918, + "step": 142 + }, + { + "epoch": 0.3704663212435233, + "grad_norm": 40.842881572203, + "learning_rate": 3.811404526160943e-05, + "loss": 1.1705, + "step": 143 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 26.891553492377298, + "learning_rate": 3.8077491489489835e-05, + "loss": 1.1468, + "step": 144 + }, + { + "epoch": 0.3756476683937824, + "grad_norm": 45.138483181178074, + "learning_rate": 3.8040604739721415e-05, + "loss": 1.1679, + "step": 145 + }, + { + "epoch": 0.37823834196891193, + "grad_norm": 35.133763086168244, + "learning_rate": 3.8003385691738227e-05, + "loss": 1.1029, + "step": 146 + }, + { + "epoch": 0.38082901554404147, + "grad_norm": 36.941250802707344, + "learning_rate": 3.7965835031095065e-05, + "loss": 1.1491, + "step": 147 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 90.1080256703095, + "learning_rate": 3.792795344945485e-05, + "loss": 1.1212, + "step": 148 + }, + { + "epoch": 0.3860103626943005, + "grad_norm": 39.70360899750413, + "learning_rate": 3.7889741644575914e-05, + "loss": 1.15, + "step": 149 + }, + { + "epoch": 0.38860103626943004, + "grad_norm": 28.229369877304094, + "learning_rate": 3.78512003202991e-05, + "loss": 1.1111, + "step": 150 + }, + { + "epoch": 0.3911917098445596, + "grad_norm": 31.611752191925987, + "learning_rate": 3.7812330186534815e-05, + "loss": 1.1366, + "step": 151 + }, + { + "epoch": 0.39378238341968913, + 
"grad_norm": 38.196015586772425, + "learning_rate": 3.777313195924998e-05, + "loss": 1.1433, + "step": 152 + }, + { + "epoch": 0.3963730569948187, + "grad_norm": 22.732638044547453, + "learning_rate": 3.773360636045481e-05, + "loss": 1.1125, + "step": 153 + }, + { + "epoch": 0.39896373056994816, + "grad_norm": 90.19158665385014, + "learning_rate": 3.7693754118189525e-05, + "loss": 1.1242, + "step": 154 + }, + { + "epoch": 0.4015544041450777, + "grad_norm": 42.43479974993017, + "learning_rate": 3.765357596651095e-05, + "loss": 1.1191, + "step": 155 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 88.0076735720364, + "learning_rate": 3.761307264547899e-05, + "loss": 1.1718, + "step": 156 + }, + { + "epoch": 0.4067357512953368, + "grad_norm": 30.782507703935767, + "learning_rate": 3.757224490114297e-05, + "loss": 1.109, + "step": 157 + }, + { + "epoch": 0.40932642487046633, + "grad_norm": 69.89871106113397, + "learning_rate": 3.7531093485527943e-05, + "loss": 1.1018, + "step": 158 + }, + { + "epoch": 0.4119170984455959, + "grad_norm": 37.339006645717305, + "learning_rate": 3.7489619156620796e-05, + "loss": 1.1358, + "step": 159 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 28.06388054378899, + "learning_rate": 3.744782267835632e-05, + "loss": 1.0847, + "step": 160 + }, + { + "epoch": 0.4170984455958549, + "grad_norm": 54.05874281297702, + "learning_rate": 3.740570482060311e-05, + "loss": 1.1682, + "step": 161 + }, + { + "epoch": 0.41968911917098445, + "grad_norm": 32.299093265328835, + "learning_rate": 3.73632663591494e-05, + "loss": 1.1413, + "step": 162 + }, + { + "epoch": 0.422279792746114, + "grad_norm": 31.213652090157694, + "learning_rate": 3.732050807568878e-05, + "loss": 1.1313, + "step": 163 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 40.01090035937505, + "learning_rate": 3.727743075780578e-05, + "loss": 1.1513, + "step": 164 + }, + { + "epoch": 0.4274611398963731, + "grad_norm": 47.11352577964853, + "learning_rate": 
3.723403519896136e-05, + "loss": 1.2192, + "step": 165 + }, + { + "epoch": 0.43005181347150256, + "grad_norm": 28.645086506093037, + "learning_rate": 3.7190322198478355e-05, + "loss": 1.1097, + "step": 166 + }, + { + "epoch": 0.4326424870466321, + "grad_norm": 35.28541113925116, + "learning_rate": 3.7146292561526654e-05, + "loss": 1.1557, + "step": 167 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 58.30281063037669, + "learning_rate": 3.7101947099108425e-05, + "loss": 1.1829, + "step": 168 + }, + { + "epoch": 0.4378238341968912, + "grad_norm": 26.33563548968379, + "learning_rate": 3.70572866280432e-05, + "loss": 1.147, + "step": 169 + }, + { + "epoch": 0.44041450777202074, + "grad_norm": 57.00052875402651, + "learning_rate": 3.701231197095277e-05, + "loss": 1.1212, + "step": 170 + }, + { + "epoch": 0.4430051813471503, + "grad_norm": 23.672828037237174, + "learning_rate": 3.696702395624608e-05, + "loss": 1.1152, + "step": 171 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 41.1264174112964, + "learning_rate": 3.692142341810395e-05, + "loss": 1.1154, + "step": 172 + }, + { + "epoch": 0.4481865284974093, + "grad_norm": 26.72177706144361, + "learning_rate": 3.6875511196463715e-05, + "loss": 1.1725, + "step": 173 + }, + { + "epoch": 0.45077720207253885, + "grad_norm": 95.4088800585977, + "learning_rate": 3.682928813700375e-05, + "loss": 1.1339, + "step": 174 + }, + { + "epoch": 0.4533678756476684, + "grad_norm": 34.33666578349465, + "learning_rate": 3.678275509112788e-05, + "loss": 1.1867, + "step": 175 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 31.032304531003014, + "learning_rate": 3.6735912915949745e-05, + "loss": 1.1386, + "step": 176 + }, + { + "epoch": 0.4585492227979275, + "grad_norm": 55.22043313188224, + "learning_rate": 3.6688762474276945e-05, + "loss": 1.1102, + "step": 177 + }, + { + "epoch": 0.46113989637305697, + "grad_norm": 29.82713377876857, + "learning_rate": 3.6641304634595216e-05, + "loss": 1.1564, + "step": 178 + }, + 
{ + "epoch": 0.4637305699481865, + "grad_norm": 35.71025459541737, + "learning_rate": 3.659354027105238e-05, + "loss": 1.0939, + "step": 179 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 52.41175655642653, + "learning_rate": 3.6545470263442265e-05, + "loss": 1.1578, + "step": 180 + }, + { + "epoch": 0.4689119170984456, + "grad_norm": 27.682485766528306, + "learning_rate": 3.649709549718849e-05, + "loss": 1.1875, + "step": 181 + }, + { + "epoch": 0.47150259067357514, + "grad_norm": 36.53293663303487, + "learning_rate": 3.6448416863328186e-05, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.4740932642487047, + "grad_norm": 31.45177998538027, + "learning_rate": 3.639943525849555e-05, + "loss": 1.113, + "step": 183 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 28.323097072885673, + "learning_rate": 3.635015158490533e-05, + "loss": 1.1159, + "step": 184 + }, + { + "epoch": 0.4792746113989637, + "grad_norm": 47.75573754341213, + "learning_rate": 3.6300566750336225e-05, + "loss": 1.1305, + "step": 185 + }, + { + "epoch": 0.48186528497409326, + "grad_norm": 21.384095061494357, + "learning_rate": 3.625068166811418e-05, + "loss": 1.1369, + "step": 186 + }, + { + "epoch": 0.4844559585492228, + "grad_norm": 30.714645036809546, + "learning_rate": 3.6200497257095504e-05, + "loss": 1.1858, + "step": 187 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 35.12161426399798, + "learning_rate": 3.615001444165001e-05, + "loss": 1.1293, + "step": 188 + }, + { + "epoch": 0.4896373056994819, + "grad_norm": 116.83443661381396, + "learning_rate": 3.6099234151643924e-05, + "loss": 1.1515, + "step": 189 + }, + { + "epoch": 0.49222797927461137, + "grad_norm": 55.47885243409044, + "learning_rate": 3.604815732242283e-05, + "loss": 1.112, + "step": 190 + }, + { + "epoch": 0.4948186528497409, + "grad_norm": 32.332747429034285, + "learning_rate": 3.5996784894794394e-05, + "loss": 1.1661, + "step": 191 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 
33.039210183180046, + "learning_rate": 3.594511781501103e-05, + "loss": 1.1244, + "step": 192 + }, + { + "epoch": 0.5, + "grad_norm": 21.325687337182504, + "learning_rate": 3.58931570347525e-05, + "loss": 1.1634, + "step": 193 + }, + { + "epoch": 0.5025906735751295, + "grad_norm": 51.37599478469561, + "learning_rate": 3.584090351110838e-05, + "loss": 1.2106, + "step": 194 + }, + { + "epoch": 0.5025906735751295, + "eval_loss": 1.1119717359542847, + "eval_runtime": 49.6027, + "eval_samples_per_second": 14.999, + "eval_steps_per_second": 0.948, + "step": 194 + }, + { + "epoch": 0.5051813471502591, + "grad_norm": 42.105169991612456, + "learning_rate": 3.57883582065604e-05, + "loss": 1.1303, + "step": 195 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 37.14457014578168, + "learning_rate": 3.573552208896474e-05, + "loss": 1.1483, + "step": 196 + }, + { + "epoch": 0.5103626943005182, + "grad_norm": 28.56241612018119, + "learning_rate": 3.568239613153421e-05, + "loss": 1.0843, + "step": 197 + }, + { + "epoch": 0.5129533678756477, + "grad_norm": 35.399304035761865, + "learning_rate": 3.5628981312820315e-05, + "loss": 1.1177, + "step": 198 + }, + { + "epoch": 0.5155440414507773, + "grad_norm": 25.91156850470446, + "learning_rate": 3.557527861669522e-05, + "loss": 1.1215, + "step": 199 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 43.509516777992324, + "learning_rate": 3.552128903233363e-05, + "loss": 1.1532, + "step": 200 + }, + { + "epoch": 0.5207253886010362, + "grad_norm": 38.18164449834795, + "learning_rate": 3.54670135541946e-05, + "loss": 1.1142, + "step": 201 + }, + { + "epoch": 0.5233160621761658, + "grad_norm": 48.576743289054534, + "learning_rate": 3.541245318200318e-05, + "loss": 1.1152, + "step": 202 + }, + { + "epoch": 0.5259067357512953, + "grad_norm": 38.65411737007163, + "learning_rate": 3.5357608920732e-05, + "loss": 1.1607, + "step": 203 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 35.663493907396834, + "learning_rate": 
3.530248178058282e-05, + "loss": 1.1273, + "step": 204 + }, + { + "epoch": 0.5310880829015544, + "grad_norm": 26.829817821665976, + "learning_rate": 3.5247072776967805e-05, + "loss": 1.1174, + "step": 205 + }, + { + "epoch": 0.533678756476684, + "grad_norm": 39.79604912152638, + "learning_rate": 3.519138293049097e-05, + "loss": 1.1811, + "step": 206 + }, + { + "epoch": 0.5362694300518135, + "grad_norm": 32.26179097390416, + "learning_rate": 3.513541326692925e-05, + "loss": 1.1346, + "step": 207 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 24.35769329902787, + "learning_rate": 3.5079164817213684e-05, + "loss": 1.1061, + "step": 208 + }, + { + "epoch": 0.5414507772020726, + "grad_norm": 26.645546258363844, + "learning_rate": 3.5022638617410396e-05, + "loss": 1.0514, + "step": 209 + }, + { + "epoch": 0.5440414507772021, + "grad_norm": 105.19676603444857, + "learning_rate": 3.496583570870152e-05, + "loss": 1.1474, + "step": 210 + }, + { + "epoch": 0.5466321243523317, + "grad_norm": 61.600623030405885, + "learning_rate": 3.4908757137366006e-05, + "loss": 1.104, + "step": 211 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 31.65460129853052, + "learning_rate": 3.485140395476038e-05, + "loss": 1.0737, + "step": 212 + }, + { + "epoch": 0.5518134715025906, + "grad_norm": 26.860379117211497, + "learning_rate": 3.4793777217299346e-05, + "loss": 1.1119, + "step": 213 + }, + { + "epoch": 0.5544041450777202, + "grad_norm": 39.89324262309783, + "learning_rate": 3.473587798643633e-05, + "loss": 1.1626, + "step": 214 + }, + { + "epoch": 0.5569948186528497, + "grad_norm": 39.77638257731599, + "learning_rate": 3.467770732864399e-05, + "loss": 1.1545, + "step": 215 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 30.994657564291458, + "learning_rate": 3.461926631539445e-05, + "loss": 1.1646, + "step": 216 + }, + { + "epoch": 0.5621761658031088, + "grad_norm": 51.99674092516571, + "learning_rate": 3.4560556023139695e-05, + "loss": 1.1638, + "step": 217 + }, + { + 
"epoch": 0.5647668393782384, + "grad_norm": 58.5132713002146, + "learning_rate": 3.450157753329166e-05, + "loss": 1.1461, + "step": 218 + }, + { + "epoch": 0.5673575129533679, + "grad_norm": 30.712469030418482, + "learning_rate": 3.4442331932202326e-05, + "loss": 1.1583, + "step": 219 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 47.00217426642832, + "learning_rate": 3.438282031114374e-05, + "loss": 1.1154, + "step": 220 + }, + { + "epoch": 0.572538860103627, + "grad_norm": 37.33927961163222, + "learning_rate": 3.432304376628787e-05, + "loss": 1.1372, + "step": 221 + }, + { + "epoch": 0.5751295336787565, + "grad_norm": 28.858636933974392, + "learning_rate": 3.4263003398686464e-05, + "loss": 1.0488, + "step": 222 + }, + { + "epoch": 0.5777202072538861, + "grad_norm": 37.842230890171486, + "learning_rate": 3.420270031425072e-05, + "loss": 1.1892, + "step": 223 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 32.65394945357516, + "learning_rate": 3.4142135623730954e-05, + "loss": 1.1218, + "step": 224 + }, + { + "epoch": 0.582901554404145, + "grad_norm": 115.22040829465772, + "learning_rate": 3.4081310442696114e-05, + "loss": 1.1546, + "step": 225 + }, + { + "epoch": 0.5854922279792746, + "grad_norm": 31.20514468446119, + "learning_rate": 3.402022589151325e-05, + "loss": 1.0969, + "step": 226 + }, + { + "epoch": 0.5880829015544041, + "grad_norm": 52.8397361926395, + "learning_rate": 3.395888309532687e-05, + "loss": 1.1218, + "step": 227 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 51.7991692917308, + "learning_rate": 3.3897283184038215e-05, + "loss": 1.1395, + "step": 228 + }, + { + "epoch": 0.5932642487046632, + "grad_norm": 33.56775233970504, + "learning_rate": 3.3835427292284445e-05, + "loss": 1.1107, + "step": 229 + }, + { + "epoch": 0.5958549222797928, + "grad_norm": 46.081120788214314, + "learning_rate": 3.3773316559417734e-05, + "loss": 1.1472, + "step": 230 + }, + { + "epoch": 0.5984455958549223, + "grad_norm": 41.72558170492288, + 
"learning_rate": 3.371095212948431e-05, + "loss": 1.1871, + "step": 231 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 34.27957927587091, + "learning_rate": 3.364833515120336e-05, + "loss": 1.1376, + "step": 232 + }, + { + "epoch": 0.6036269430051814, + "grad_norm": 36.58452602010953, + "learning_rate": 3.358546677794586e-05, + "loss": 1.1885, + "step": 233 + }, + { + "epoch": 0.6062176165803109, + "grad_norm": 28.010809914189192, + "learning_rate": 3.352234816771337e-05, + "loss": 1.102, + "step": 234 + }, + { + "epoch": 0.6088082901554405, + "grad_norm": 24.78419558611963, + "learning_rate": 3.3458980483116664e-05, + "loss": 1.0818, + "step": 235 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 28.12830040081226, + "learning_rate": 3.3395364891354316e-05, + "loss": 1.1862, + "step": 236 + }, + { + "epoch": 0.6139896373056994, + "grad_norm": 37.94181651161551, + "learning_rate": 3.333150256419127e-05, + "loss": 1.147, + "step": 237 + }, + { + "epoch": 0.616580310880829, + "grad_norm": 21.809518482701854, + "learning_rate": 3.3267394677937134e-05, + "loss": 1.0994, + "step": 238 + }, + { + "epoch": 0.6191709844559585, + "grad_norm": 32.12135773753589, + "learning_rate": 3.320304241342464e-05, + "loss": 1.1531, + "step": 239 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 51.959731073524054, + "learning_rate": 3.31384469559878e-05, + "loss": 1.1717, + "step": 240 + }, + { + "epoch": 0.6243523316062176, + "grad_norm": 28.045815836372345, + "learning_rate": 3.307360949544012e-05, + "loss": 1.1814, + "step": 241 + }, + { + "epoch": 0.6269430051813472, + "grad_norm": 39.55208384578746, + "learning_rate": 3.300853122605268e-05, + "loss": 1.1483, + "step": 242 + }, + { + "epoch": 0.6295336787564767, + "grad_norm": 29.799974205160808, + "learning_rate": 3.294321334653213e-05, + "loss": 1.1838, + "step": 243 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 124.31035254102245, + "learning_rate": 3.2877657059998584e-05, + "loss": 1.0698, + "step": 
244 + }, + { + "epoch": 0.6347150259067358, + "grad_norm": 37.989925180187655, + "learning_rate": 3.281186357396351e-05, + "loss": 1.0984, + "step": 245 + }, + { + "epoch": 0.6373056994818653, + "grad_norm": 55.72599333657572, + "learning_rate": 3.274583410030745e-05, + "loss": 1.2333, + "step": 246 + }, + { + "epoch": 0.6398963730569949, + "grad_norm": 46.77079456439719, + "learning_rate": 3.267956985525774e-05, + "loss": 1.2157, + "step": 247 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 33.62329915252562, + "learning_rate": 3.261307205936603e-05, + "loss": 1.1752, + "step": 248 + }, + { + "epoch": 0.6450777202072538, + "grad_norm": 34.11794183225494, + "learning_rate": 3.2546341937485884e-05, + "loss": 1.1265, + "step": 249 + }, + { + "epoch": 0.6476683937823834, + "grad_norm": 36.027636323913896, + "learning_rate": 3.247938071875017e-05, + "loss": 1.103, + "step": 250 + }, + { + "epoch": 0.6502590673575129, + "grad_norm": 35.393219337329946, + "learning_rate": 3.2412189636548456e-05, + "loss": 1.1148, + "step": 251 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 31.578919022569924, + "learning_rate": 3.234476992850425e-05, + "loss": 1.1149, + "step": 252 + }, + { + "epoch": 0.655440414507772, + "grad_norm": 28.93717647736964, + "learning_rate": 3.227712283645224e-05, + "loss": 1.1425, + "step": 253 + }, + { + "epoch": 0.6580310880829016, + "grad_norm": 34.170026750703684, + "learning_rate": 3.2209249606415394e-05, + "loss": 1.1591, + "step": 254 + }, + { + "epoch": 0.6606217616580311, + "grad_norm": 27.52194954061608, + "learning_rate": 3.214115148858201e-05, + "loss": 1.1704, + "step": 255 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 81.65404753769732, + "learning_rate": 3.207282973728273e-05, + "loss": 1.161, + "step": 256 + }, + { + "epoch": 0.6658031088082902, + "grad_norm": 57.45351536522683, + "learning_rate": 3.200428561096737e-05, + "loss": 1.116, + "step": 257 + }, + { + "epoch": 0.6683937823834197, + "grad_norm": 
30.968529074463714, + "learning_rate": 3.193552037218179e-05, + "loss": 1.1265, + "step": 258 + }, + { + "epoch": 0.6709844559585493, + "grad_norm": 37.8817748068655, + "learning_rate": 3.186653528754464e-05, + "loss": 1.1287, + "step": 259 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 29.197031189172545, + "learning_rate": 3.179733162772398e-05, + "loss": 1.1045, + "step": 260 + }, + { + "epoch": 0.6761658031088082, + "grad_norm": 36.56253841299107, + "learning_rate": 3.172791066741392e-05, + "loss": 1.1539, + "step": 261 + }, + { + "epoch": 0.6787564766839378, + "grad_norm": 25.799921116950998, + "learning_rate": 3.165827368531113e-05, + "loss": 1.0796, + "step": 262 + }, + { + "epoch": 0.6813471502590673, + "grad_norm": 82.81825216532526, + "learning_rate": 3.1588421964091276e-05, + "loss": 1.142, + "step": 263 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 31.100074747569124, + "learning_rate": 3.151835679038542e-05, + "loss": 1.0908, + "step": 264 + }, + { + "epoch": 0.6865284974093264, + "grad_norm": 25.57297200703221, + "learning_rate": 3.14480794547563e-05, + "loss": 1.1436, + "step": 265 + }, + { + "epoch": 0.689119170984456, + "grad_norm": 23.92492773149328, + "learning_rate": 3.137759125167455e-05, + "loss": 1.1202, + "step": 266 + }, + { + "epoch": 0.6917098445595855, + "grad_norm": 22.14274360766396, + "learning_rate": 3.130689347949486e-05, + "loss": 1.1113, + "step": 267 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 26.68725288649902, + "learning_rate": 3.123598744043211e-05, + "loss": 1.1517, + "step": 268 + }, + { + "epoch": 0.6968911917098446, + "grad_norm": 25.559817524659362, + "learning_rate": 3.1164874440537295e-05, + "loss": 1.0976, + "step": 269 + }, + { + "epoch": 0.6994818652849741, + "grad_norm": 28.89996834100355, + "learning_rate": 3.109355578967356e-05, + "loss": 1.1932, + "step": 270 + }, + { + "epoch": 0.7020725388601037, + "grad_norm": 32.09658045195569, + "learning_rate": 3.1022032801492e-05, + "loss": 
1.1161, + "step": 271 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 30.623705646213768, + "learning_rate": 3.095030679340751e-05, + "loss": 1.1993, + "step": 272 + }, + { + "epoch": 0.7072538860103627, + "grad_norm": 41.71263710932429, + "learning_rate": 3.0878379086574494e-05, + "loss": 1.1624, + "step": 273 + }, + { + "epoch": 0.7098445595854922, + "grad_norm": 34.68352639470226, + "learning_rate": 3.0806251005862535e-05, + "loss": 1.1156, + "step": 274 + }, + { + "epoch": 0.7124352331606217, + "grad_norm": 23.52580702428812, + "learning_rate": 3.073392387983202e-05, + "loss": 1.0963, + "step": 275 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 28.10687988214902, + "learning_rate": 3.0661399040709584e-05, + "loss": 1.1095, + "step": 276 + }, + { + "epoch": 0.7176165803108808, + "grad_norm": 66.72288729975841, + "learning_rate": 3.05886778243637e-05, + "loss": 1.0865, + "step": 277 + }, + { + "epoch": 0.7202072538860104, + "grad_norm": 25.775217430321934, + "learning_rate": 3.051576157027998e-05, + "loss": 1.1058, + "step": 278 + }, + { + "epoch": 0.7227979274611399, + "grad_norm": 36.82942099016794, + "learning_rate": 3.0442651621536502e-05, + "loss": 1.1211, + "step": 279 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 27.878820856521013, + "learning_rate": 3.0369349324779115e-05, + "loss": 1.1471, + "step": 280 + }, + { + "epoch": 0.727979274611399, + "grad_norm": 31.293156717285573, + "learning_rate": 3.0295856030196618e-05, + "loss": 1.0748, + "step": 281 + }, + { + "epoch": 0.7305699481865285, + "grad_norm": 39.315952115194435, + "learning_rate": 3.022217309149588e-05, + "loss": 1.0993, + "step": 282 + }, + { + "epoch": 0.7331606217616581, + "grad_norm": 36.79954071435495, + "learning_rate": 3.0148301865876913e-05, + "loss": 1.1045, + "step": 283 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 26.127389502147167, + "learning_rate": 3.0074243714007875e-05, + "loss": 1.1424, + "step": 284 + }, + { + "epoch": 0.7383419689119171, + 
"grad_norm": 25.608778060317068, + "learning_rate": 3.0000000000000004e-05, + "loss": 1.1055, + "step": 285 + }, + { + "epoch": 0.7409326424870466, + "grad_norm": 36.22629669671894, + "learning_rate": 2.992557209138249e-05, + "loss": 1.0845, + "step": 286 + }, + { + "epoch": 0.7435233160621761, + "grad_norm": 35.30642111132886, + "learning_rate": 2.9850961359077293e-05, + "loss": 1.204, + "step": 287 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 29.765894622087952, + "learning_rate": 2.977616917737388e-05, + "loss": 1.168, + "step": 288 + }, + { + "epoch": 0.7487046632124352, + "grad_norm": 27.194683587397567, + "learning_rate": 2.9701196923903927e-05, + "loss": 1.1236, + "step": 289 + }, + { + "epoch": 0.7512953367875648, + "grad_norm": 63.09779240191165, + "learning_rate": 2.9626045979615928e-05, + "loss": 1.1395, + "step": 290 + }, + { + "epoch": 0.7538860103626943, + "grad_norm": 25.014233377763066, + "learning_rate": 2.9550717728749768e-05, + "loss": 1.1054, + "step": 291 + }, + { + "epoch": 0.7538860103626943, + "eval_loss": 1.0996382236480713, + "eval_runtime": 37.9545, + "eval_samples_per_second": 19.602, + "eval_steps_per_second": 1.238, + "step": 291 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 27.481891737318097, + "learning_rate": 2.947521355881122e-05, + "loss": 1.1252, + "step": 292 + }, + { + "epoch": 0.7590673575129534, + "grad_norm": 67.57807413949878, + "learning_rate": 2.9399534860546404e-05, + "loss": 1.1761, + "step": 293 + }, + { + "epoch": 0.7616580310880829, + "grad_norm": 65.66834495909988, + "learning_rate": 2.932368302791614e-05, + "loss": 1.0551, + "step": 294 + }, + { + "epoch": 0.7642487046632125, + "grad_norm": 30.051210942517116, + "learning_rate": 2.92476594580703e-05, + "loss": 1.138, + "step": 295 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 22.693089678510507, + "learning_rate": 2.917146555132206e-05, + "loss": 1.1495, + "step": 296 + }, + { + "epoch": 0.7694300518134715, + "grad_norm": 
53.84166280540606, + "learning_rate": 2.909510271112212e-05, + "loss": 1.1409, + "step": 297 + }, + { + "epoch": 0.772020725388601, + "grad_norm": 32.69106061524578, + "learning_rate": 2.9018572344032823e-05, + "loss": 1.1709, + "step": 298 + }, + { + "epoch": 0.7746113989637305, + "grad_norm": 39.44484991312582, + "learning_rate": 2.8941875859702283e-05, + "loss": 1.1138, + "step": 299 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 31.51857596969122, + "learning_rate": 2.88650146708384e-05, + "loss": 1.1931, + "step": 300 + }, + { + "epoch": 0.7797927461139896, + "grad_norm": 70.51218412614058, + "learning_rate": 2.878799019318283e-05, + "loss": 1.155, + "step": 301 + }, + { + "epoch": 0.7823834196891192, + "grad_norm": 80.27969224752457, + "learning_rate": 2.8710803845484955e-05, + "loss": 1.1425, + "step": 302 + }, + { + "epoch": 0.7849740932642487, + "grad_norm": 28.16560857981767, + "learning_rate": 2.8633457049475678e-05, + "loss": 1.1072, + "step": 303 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 41.15138307552231, + "learning_rate": 2.855595122984129e-05, + "loss": 1.1492, + "step": 304 + }, + { + "epoch": 0.7901554404145078, + "grad_norm": 23.894217282116276, + "learning_rate": 2.847828781419722e-05, + "loss": 1.1136, + "step": 305 + }, + { + "epoch": 0.7927461139896373, + "grad_norm": 25.005501120810248, + "learning_rate": 2.8400468233061708e-05, + "loss": 1.0921, + "step": 306 + }, + { + "epoch": 0.7953367875647669, + "grad_norm": 30.91791938195468, + "learning_rate": 2.832249391982949e-05, + "loss": 1.1098, + "step": 307 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 44.776563922922726, + "learning_rate": 2.8244366310745398e-05, + "loss": 1.1845, + "step": 308 + }, + { + "epoch": 0.8005181347150259, + "grad_norm": 19.059329544784376, + "learning_rate": 2.816608684487787e-05, + "loss": 1.169, + "step": 309 + }, + { + "epoch": 0.8031088082901554, + "grad_norm": 63.97334641962602, + "learning_rate": 2.8087656964092472e-05, + 
"loss": 1.124, + "step": 310 + }, + { + "epoch": 0.805699481865285, + "grad_norm": 30.878848859015882, + "learning_rate": 2.8009078113025335e-05, + "loss": 1.2087, + "step": 311 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 34.63835471543836, + "learning_rate": 2.7930351739056533e-05, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.810880829015544, + "grad_norm": 30.03178182445718, + "learning_rate": 2.7851479292283442e-05, + "loss": 1.1321, + "step": 313 + }, + { + "epoch": 0.8134715025906736, + "grad_norm": 38.42236523356876, + "learning_rate": 2.7772462225494013e-05, + "loss": 1.1557, + "step": 314 + }, + { + "epoch": 0.8160621761658031, + "grad_norm": 39.179683790956744, + "learning_rate": 2.7693301994140026e-05, + "loss": 1.1201, + "step": 315 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 38.32243159447327, + "learning_rate": 2.761400005631028e-05, + "loss": 1.1105, + "step": 316 + }, + { + "epoch": 0.8212435233160622, + "grad_norm": 39.913808227411835, + "learning_rate": 2.7534557872703705e-05, + "loss": 1.1598, + "step": 317 + }, + { + "epoch": 0.8238341968911918, + "grad_norm": 69.73521867812421, + "learning_rate": 2.7454976906602513e-05, + "loss": 1.1145, + "step": 318 + }, + { + "epoch": 0.8264248704663213, + "grad_norm": 65.55887588207746, + "learning_rate": 2.7375258623845207e-05, + "loss": 1.1255, + "step": 319 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 30.980111545641563, + "learning_rate": 2.7295404492799575e-05, + "loss": 1.122, + "step": 320 + }, + { + "epoch": 0.8316062176165803, + "grad_norm": 30.12179911444832, + "learning_rate": 2.721541598433567e-05, + "loss": 1.113, + "step": 321 + }, + { + "epoch": 0.8341968911917098, + "grad_norm": 28.329434659508582, + "learning_rate": 2.7135294571798706e-05, + "loss": 1.0498, + "step": 322 + }, + { + "epoch": 0.8367875647668394, + "grad_norm": 25.114787597049578, + "learning_rate": 2.70550417309819e-05, + "loss": 1.0633, + "step": 323 + }, + { + "epoch": 
0.8393782383419689, + "grad_norm": 27.754037709590385, + "learning_rate": 2.6974658940099337e-05, + "loss": 1.1585, + "step": 324 + }, + { + "epoch": 0.8419689119170984, + "grad_norm": 29.489888159179444, + "learning_rate": 2.6894147679758678e-05, + "loss": 1.1259, + "step": 325 + }, + { + "epoch": 0.844559585492228, + "grad_norm": 24.426102194202898, + "learning_rate": 2.6813509432933957e-05, + "loss": 1.1515, + "step": 326 + }, + { + "epoch": 0.8471502590673575, + "grad_norm": 24.75197483331429, + "learning_rate": 2.673274568493821e-05, + "loss": 1.15, + "step": 327 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 40.604864626683366, + "learning_rate": 2.6651857923396132e-05, + "loss": 1.1219, + "step": 328 + }, + { + "epoch": 0.8523316062176166, + "grad_norm": 34.694568404196026, + "learning_rate": 2.6570847638216698e-05, + "loss": 1.103, + "step": 329 + }, + { + "epoch": 0.8549222797927462, + "grad_norm": 48.715136403425035, + "learning_rate": 2.648971632156569e-05, + "loss": 1.1675, + "step": 330 + }, + { + "epoch": 0.8575129533678757, + "grad_norm": 97.77526410121799, + "learning_rate": 2.6408465467838225e-05, + "loss": 1.1502, + "step": 331 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 54.697215318949276, + "learning_rate": 2.632709657363124e-05, + "loss": 1.1446, + "step": 332 + }, + { + "epoch": 0.8626943005181347, + "grad_norm": 38.09192002041798, + "learning_rate": 2.6245611137715897e-05, + "loss": 1.1333, + "step": 333 + }, + { + "epoch": 0.8652849740932642, + "grad_norm": 46.713623556984956, + "learning_rate": 2.6164010661010007e-05, + "loss": 1.1252, + "step": 334 + }, + { + "epoch": 0.8678756476683938, + "grad_norm": 46.40552686286593, + "learning_rate": 2.6082296646550364e-05, + "loss": 1.121, + "step": 335 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 37.57424454065957, + "learning_rate": 2.6000470599465065e-05, + "loss": 1.1671, + "step": 336 + }, + { + "epoch": 0.8730569948186528, + "grad_norm": 38.580777053099204, + 
"learning_rate": 2.5918534026945787e-05, + "loss": 1.0849, + "step": 337 + }, + { + "epoch": 0.8756476683937824, + "grad_norm": 154.3106712010981, + "learning_rate": 2.5836488438220044e-05, + "loss": 1.0663, + "step": 338 + }, + { + "epoch": 0.8782383419689119, + "grad_norm": 34.21394067951015, + "learning_rate": 2.575433534452334e-05, + "loss": 1.0895, + "step": 339 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 36.291611242733886, + "learning_rate": 2.5672076259071385e-05, + "loss": 1.1242, + "step": 340 + }, + { + "epoch": 0.883419689119171, + "grad_norm": 29.411623389655112, + "learning_rate": 2.558971269703219e-05, + "loss": 1.1005, + "step": 341 + }, + { + "epoch": 0.8860103626943006, + "grad_norm": 30.24903086761753, + "learning_rate": 2.5507246175498174e-05, + "loss": 1.1134, + "step": 342 + }, + { + "epoch": 0.8886010362694301, + "grad_norm": 22.032293114161938, + "learning_rate": 2.5424678213458202e-05, + "loss": 1.1121, + "step": 343 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 34.997361528376956, + "learning_rate": 2.5342010331769635e-05, + "loss": 1.1341, + "step": 344 + }, + { + "epoch": 0.8937823834196891, + "grad_norm": 28.212824875732352, + "learning_rate": 2.5259244053130295e-05, + "loss": 1.0748, + "step": 345 + }, + { + "epoch": 0.8963730569948186, + "grad_norm": 23.870011592985897, + "learning_rate": 2.5176380902050418e-05, + "loss": 1.0643, + "step": 346 + }, + { + "epoch": 0.8989637305699482, + "grad_norm": 26.10018699309748, + "learning_rate": 2.5093422404824574e-05, + "loss": 1.1662, + "step": 347 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 30.191468778559166, + "learning_rate": 2.5010370089503578e-05, + "loss": 1.1023, + "step": 348 + }, + { + "epoch": 0.9041450777202072, + "grad_norm": 55.799581973427415, + "learning_rate": 2.4927225485866297e-05, + "loss": 1.1538, + "step": 349 + }, + { + "epoch": 0.9067357512953368, + "grad_norm": 35.7030284720465, + "learning_rate": 2.4843990125391516e-05, + "loss": 1.1, + 
"step": 350 + }, + { + "epoch": 0.9093264248704663, + "grad_norm": 28.61763302791738, + "learning_rate": 2.4760665541229712e-05, + "loss": 1.0914, + "step": 351 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 33.34233685155311, + "learning_rate": 2.467725326817481e-05, + "loss": 1.0862, + "step": 352 + }, + { + "epoch": 0.9145077720207254, + "grad_norm": 25.441052078480084, + "learning_rate": 2.4593754842635917e-05, + "loss": 1.1422, + "step": 353 + }, + { + "epoch": 0.917098445595855, + "grad_norm": 24.217974454985058, + "learning_rate": 2.451017180260902e-05, + "loss": 1.132, + "step": 354 + }, + { + "epoch": 0.9196891191709845, + "grad_norm": 57.986011465793155, + "learning_rate": 2.4426505687648653e-05, + "loss": 1.2082, + "step": 355 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 34.058264716876195, + "learning_rate": 2.4342758038839573e-05, + "loss": 1.1679, + "step": 356 + }, + { + "epoch": 0.9248704663212435, + "grad_norm": 28.621514922275253, + "learning_rate": 2.4258930398768317e-05, + "loss": 1.1319, + "step": 357 + }, + { + "epoch": 0.927461139896373, + "grad_norm": 35.33355417283227, + "learning_rate": 2.4175024311494835e-05, + "loss": 1.0705, + "step": 358 + }, + { + "epoch": 0.9300518134715026, + "grad_norm": 46.579572933583265, + "learning_rate": 2.4091041322524023e-05, + "loss": 1.0842, + "step": 359 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 35.494740787672974, + "learning_rate": 2.4006982978777263e-05, + "loss": 1.1072, + "step": 360 + }, + { + "epoch": 0.9352331606217616, + "grad_norm": 44.56606839509262, + "learning_rate": 2.392285082856394e-05, + "loss": 1.1125, + "step": 361 + }, + { + "epoch": 0.9378238341968912, + "grad_norm": 46.26363869084929, + "learning_rate": 2.3838646421552917e-05, + "loss": 1.1268, + "step": 362 + }, + { + "epoch": 0.9404145077720207, + "grad_norm": 89.17676267680146, + "learning_rate": 2.3754371308743975e-05, + "loss": 1.0893, + "step": 363 + }, + { + "epoch": 0.9430051813471503, + 
"grad_norm": 34.87700187494181, + "learning_rate": 2.367002704243927e-05, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.9455958549222798, + "grad_norm": 32.92806939217504, + "learning_rate": 2.3585615176214716e-05, + "loss": 1.1488, + "step": 365 + }, + { + "epoch": 0.9481865284974094, + "grad_norm": 27.27458755248548, + "learning_rate": 2.3501137264891396e-05, + "loss": 1.0874, + "step": 366 + }, + { + "epoch": 0.9507772020725389, + "grad_norm": 24.959123789739834, + "learning_rate": 2.3416594864506887e-05, + "loss": 1.1783, + "step": 367 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 31.838670988369724, + "learning_rate": 2.333198953228664e-05, + "loss": 1.0759, + "step": 368 + }, + { + "epoch": 0.9559585492227979, + "grad_norm": 28.112870222863155, + "learning_rate": 2.3247322826615276e-05, + "loss": 1.1481, + "step": 369 + }, + { + "epoch": 0.9585492227979274, + "grad_norm": 35.08461098450067, + "learning_rate": 2.316259630700787e-05, + "loss": 1.0953, + "step": 370 + }, + { + "epoch": 0.961139896373057, + "grad_norm": 37.80899503618479, + "learning_rate": 2.307781153408124e-05, + "loss": 1.1224, + "step": 371 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 31.644978122007387, + "learning_rate": 2.2992970069525202e-05, + "loss": 1.1608, + "step": 372 + }, + { + "epoch": 0.966321243523316, + "grad_norm": 23.51029318210938, + "learning_rate": 2.29080734760738e-05, + "loss": 1.0914, + "step": 373 + }, + { + "epoch": 0.9689119170984456, + "grad_norm": 28.97240481418573, + "learning_rate": 2.2823123317476522e-05, + "loss": 1.1117, + "step": 374 + }, + { + "epoch": 0.9715025906735751, + "grad_norm": 36.613893678320395, + "learning_rate": 2.273812115846951e-05, + "loss": 1.1118, + "step": 375 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 26.402979304578093, + "learning_rate": 2.2653068564746692e-05, + "loss": 1.13, + "step": 376 + }, + { + "epoch": 0.9766839378238342, + "grad_norm": 114.3000444613392, + "learning_rate": 
2.2567967102931025e-05, + "loss": 1.1539, + "step": 377 + }, + { + "epoch": 0.9792746113989638, + "grad_norm": 26.861359932396834, + "learning_rate": 2.2482818340545534e-05, + "loss": 1.0566, + "step": 378 + }, + { + "epoch": 0.9818652849740933, + "grad_norm": 32.75509374223994, + "learning_rate": 2.2397623845984548e-05, + "loss": 1.1746, + "step": 379 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 34.11964206838379, + "learning_rate": 2.2312385188484718e-05, + "loss": 1.0834, + "step": 380 + }, + { + "epoch": 0.9870466321243523, + "grad_norm": 38.019564122226434, + "learning_rate": 2.2227103938096176e-05, + "loss": 1.1074, + "step": 381 + }, + { + "epoch": 0.9896373056994818, + "grad_norm": 39.5073811375391, + "learning_rate": 2.2141781665653584e-05, + "loss": 1.1082, + "step": 382 + }, + { + "epoch": 0.9922279792746114, + "grad_norm": 298.4258332795163, + "learning_rate": 2.205641994274721e-05, + "loss": 1.125, + "step": 383 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 36.444415670935506, + "learning_rate": 2.1971020341693973e-05, + "loss": 1.0935, + "step": 384 + }, + { + "epoch": 0.9974093264248705, + "grad_norm": 28.96533429210575, + "learning_rate": 2.188558443550849e-05, + "loss": 1.0957, + "step": 385 + }, + { + "epoch": 1.0, + "grad_norm": 66.41241684127401, + "learning_rate": 2.180011379787411e-05, + "loss": 1.1335, + "step": 386 + }, + { + "epoch": 1.0025906735751295, + "grad_norm": 28.75549619538953, + "learning_rate": 2.1714610003113887e-05, + "loss": 1.1316, + "step": 387 + }, + { + "epoch": 1.005181347150259, + "grad_norm": 26.911837500852275, + "learning_rate": 2.1629074626161647e-05, + "loss": 1.1026, + "step": 388 + }, + { + "epoch": 1.005181347150259, + "eval_loss": 1.0908173322677612, + "eval_runtime": 37.7642, + "eval_samples_per_second": 19.701, + "eval_steps_per_second": 1.245, + "step": 388 + }, + { + "epoch": 1.0077720207253886, + "grad_norm": 34.28722746775385, + "learning_rate": 2.1543509242532932e-05, + "loss": 1.1104, 
+ "step": 389 + }, + { + "epoch": 1.0103626943005182, + "grad_norm": 37.97709310694863, + "learning_rate": 2.145791542829597e-05, + "loss": 1.0663, + "step": 390 + }, + { + "epoch": 1.0129533678756477, + "grad_norm": 39.379668162327384, + "learning_rate": 2.1372294760042686e-05, + "loss": 1.1405, + "step": 391 + }, + { + "epoch": 1.0155440414507773, + "grad_norm": 27.136201219298698, + "learning_rate": 2.1286648814859636e-05, + "loss": 1.0963, + "step": 392 + }, + { + "epoch": 1.0181347150259068, + "grad_norm": 39.34261641469313, + "learning_rate": 2.120097917029897e-05, + "loss": 1.1276, + "step": 393 + }, + { + "epoch": 1.0207253886010363, + "grad_norm": 46.77583801285328, + "learning_rate": 2.1115287404349357e-05, + "loss": 1.1171, + "step": 394 + }, + { + "epoch": 1.0233160621761659, + "grad_norm": 55.10335066695868, + "learning_rate": 2.1029575095406933e-05, + "loss": 1.0831, + "step": 395 + }, + { + "epoch": 1.0259067357512954, + "grad_norm": 76.88533851789373, + "learning_rate": 2.0943843822246234e-05, + "loss": 1.0925, + "step": 396 + }, + { + "epoch": 1.028497409326425, + "grad_norm": 29.604569209708462, + "learning_rate": 2.0858095163991094e-05, + "loss": 1.1259, + "step": 397 + }, + { + "epoch": 1.0310880829015545, + "grad_norm": 37.71348366628868, + "learning_rate": 2.077233070008557e-05, + "loss": 1.0792, + "step": 398 + }, + { + "epoch": 1.0336787564766838, + "grad_norm": 26.866133194031644, + "learning_rate": 2.0686552010264872e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 1.0362694300518134, + "grad_norm": 35.739274800620635, + "learning_rate": 2.060076067452622e-05, + "loss": 1.0837, + "step": 400 + }, + { + "epoch": 1.038860103626943, + "grad_norm": 24.479129391259896, + "learning_rate": 2.0514958273099778e-05, + "loss": 1.073, + "step": 401 + }, + { + "epoch": 1.0414507772020725, + "grad_norm": 50.49963650108008, + "learning_rate": 2.042914638641952e-05, + "loss": 1.0912, + "step": 402 + }, + { + "epoch": 1.044041450777202, + 
"grad_norm": 35.6875451072032, + "learning_rate": 2.0343326595094154e-05, + "loss": 1.0936, + "step": 403 + }, + { + "epoch": 1.0466321243523315, + "grad_norm": 30.212298193414487, + "learning_rate": 2.0257500479877965e-05, + "loss": 1.089, + "step": 404 + }, + { + "epoch": 1.049222797927461, + "grad_norm": 28.65828720015124, + "learning_rate": 2.0171669621641743e-05, + "loss": 1.1727, + "step": 405 + }, + { + "epoch": 1.0518134715025906, + "grad_norm": 39.2199058392425, + "learning_rate": 2.0085835601343627e-05, + "loss": 1.1493, + "step": 406 + }, + { + "epoch": 1.0544041450777202, + "grad_norm": 110.01204177059546, + "learning_rate": 2e-05, + "loss": 1.1245, + "step": 407 + }, + { + "epoch": 1.0569948186528497, + "grad_norm": 43.427381349600374, + "learning_rate": 1.9914164398656383e-05, + "loss": 1.1183, + "step": 408 + }, + { + "epoch": 1.0595854922279793, + "grad_norm": 64.78768909817894, + "learning_rate": 1.9828330378358264e-05, + "loss": 1.1528, + "step": 409 + }, + { + "epoch": 1.0621761658031088, + "grad_norm": 26.50257915912425, + "learning_rate": 1.974249952012204e-05, + "loss": 1.1568, + "step": 410 + }, + { + "epoch": 1.0647668393782384, + "grad_norm": 27.63159204178893, + "learning_rate": 1.9656673404905852e-05, + "loss": 1.1071, + "step": 411 + }, + { + "epoch": 1.067357512953368, + "grad_norm": 27.0795355533723, + "learning_rate": 1.957085361358049e-05, + "loss": 1.0809, + "step": 412 + }, + { + "epoch": 1.0699481865284974, + "grad_norm": 41.84795332660821, + "learning_rate": 1.9485041726900232e-05, + "loss": 1.0744, + "step": 413 + }, + { + "epoch": 1.072538860103627, + "grad_norm": 143.2109134427192, + "learning_rate": 1.939923932547379e-05, + "loss": 1.0905, + "step": 414 + }, + { + "epoch": 1.0751295336787565, + "grad_norm": 89.55384065946154, + "learning_rate": 1.931344798973513e-05, + "loss": 1.1012, + "step": 415 + }, + { + "epoch": 1.077720207253886, + "grad_norm": 31.072074793068015, + "learning_rate": 1.922766929991443e-05, + "loss": 
1.1141, + "step": 416 + }, + { + "epoch": 1.0803108808290156, + "grad_norm": 29.82683189045969, + "learning_rate": 1.914190483600891e-05, + "loss": 1.0842, + "step": 417 + }, + { + "epoch": 1.0829015544041452, + "grad_norm": 30.09708662586305, + "learning_rate": 1.9056156177753776e-05, + "loss": 1.1088, + "step": 418 + }, + { + "epoch": 1.0854922279792747, + "grad_norm": 27.637437518920503, + "learning_rate": 1.897042490459307e-05, + "loss": 1.058, + "step": 419 + }, + { + "epoch": 1.0880829015544042, + "grad_norm": 69.34285700381683, + "learning_rate": 1.8884712595650653e-05, + "loss": 1.0314, + "step": 420 + }, + { + "epoch": 1.0906735751295338, + "grad_norm": 25.644927284592956, + "learning_rate": 1.8799020829701036e-05, + "loss": 1.0916, + "step": 421 + }, + { + "epoch": 1.093264248704663, + "grad_norm": 30.3898986852319, + "learning_rate": 1.871335118514037e-05, + "loss": 1.0797, + "step": 422 + }, + { + "epoch": 1.0958549222797926, + "grad_norm": 22.271334693423444, + "learning_rate": 1.862770523995732e-05, + "loss": 1.1134, + "step": 423 + }, + { + "epoch": 1.0984455958549222, + "grad_norm": 35.85874616678876, + "learning_rate": 1.854208457170404e-05, + "loss": 1.0927, + "step": 424 + }, + { + "epoch": 1.1010362694300517, + "grad_norm": 43.06832041948097, + "learning_rate": 1.8456490757467075e-05, + "loss": 1.093, + "step": 425 + }, + { + "epoch": 1.1036269430051813, + "grad_norm": 37.83777637993467, + "learning_rate": 1.8370925373838356e-05, + "loss": 1.1268, + "step": 426 + }, + { + "epoch": 1.1062176165803108, + "grad_norm": 23.798059023605177, + "learning_rate": 1.8285389996886113e-05, + "loss": 1.0989, + "step": 427 + }, + { + "epoch": 1.1088082901554404, + "grad_norm": 25.443104465500795, + "learning_rate": 1.8199886202125897e-05, + "loss": 1.0581, + "step": 428 + }, + { + "epoch": 1.11139896373057, + "grad_norm": 23.76241444847441, + "learning_rate": 1.8114415564491513e-05, + "loss": 1.0908, + "step": 429 + }, + { + "epoch": 1.1139896373056994, + 
"grad_norm": 26.5600693044426, + "learning_rate": 1.8028979658306033e-05, + "loss": 1.1321, + "step": 430 + }, + { + "epoch": 1.116580310880829, + "grad_norm": 44.854375199828986, + "learning_rate": 1.794358005725279e-05, + "loss": 1.0762, + "step": 431 + }, + { + "epoch": 1.1191709844559585, + "grad_norm": 28.05797777410846, + "learning_rate": 1.785821833434642e-05, + "loss": 1.0698, + "step": 432 + }, + { + "epoch": 1.121761658031088, + "grad_norm": 26.488479630212364, + "learning_rate": 1.7772896061903824e-05, + "loss": 1.1223, + "step": 433 + }, + { + "epoch": 1.1243523316062176, + "grad_norm": 32.77084542157883, + "learning_rate": 1.768761481151529e-05, + "loss": 1.0984, + "step": 434 + }, + { + "epoch": 1.1269430051813472, + "grad_norm": 39.13198413130026, + "learning_rate": 1.7602376154015456e-05, + "loss": 1.1551, + "step": 435 + }, + { + "epoch": 1.1295336787564767, + "grad_norm": 23.878966995283953, + "learning_rate": 1.751718165945447e-05, + "loss": 1.1133, + "step": 436 + }, + { + "epoch": 1.1321243523316062, + "grad_norm": 33.90472985566232, + "learning_rate": 1.743203289706898e-05, + "loss": 1.1219, + "step": 437 + }, + { + "epoch": 1.1347150259067358, + "grad_norm": 23.340369938533712, + "learning_rate": 1.734693143525331e-05, + "loss": 1.1244, + "step": 438 + }, + { + "epoch": 1.1373056994818653, + "grad_norm": 105.6885206147852, + "learning_rate": 1.7261878841530494e-05, + "loss": 1.0788, + "step": 439 + }, + { + "epoch": 1.1398963730569949, + "grad_norm": 28.453526076458317, + "learning_rate": 1.717687668252348e-05, + "loss": 1.1576, + "step": 440 + }, + { + "epoch": 1.1424870466321244, + "grad_norm": 36.1473991485961, + "learning_rate": 1.7091926523926205e-05, + "loss": 1.0859, + "step": 441 + }, + { + "epoch": 1.145077720207254, + "grad_norm": 27.043461146902448, + "learning_rate": 1.7007029930474804e-05, + "loss": 1.1072, + "step": 442 + }, + { + "epoch": 1.1476683937823835, + "grad_norm": 28.066170619981435, + "learning_rate": 
1.6922188465918763e-05, + "loss": 1.1279, + "step": 443 + }, + { + "epoch": 1.150259067357513, + "grad_norm": 38.62445822837212, + "learning_rate": 1.6837403692992136e-05, + "loss": 1.1275, + "step": 444 + }, + { + "epoch": 1.1528497409326426, + "grad_norm": 28.077258963587767, + "learning_rate": 1.6752677173384734e-05, + "loss": 1.1004, + "step": 445 + }, + { + "epoch": 1.1554404145077721, + "grad_norm": 42.1405744301338, + "learning_rate": 1.6668010467713363e-05, + "loss": 1.1141, + "step": 446 + }, + { + "epoch": 1.1580310880829017, + "grad_norm": 26.827291684301034, + "learning_rate": 1.658340513549312e-05, + "loss": 1.1216, + "step": 447 + }, + { + "epoch": 1.160621761658031, + "grad_norm": 30.863489441619983, + "learning_rate": 1.649886273510861e-05, + "loss": 1.1898, + "step": 448 + }, + { + "epoch": 1.1632124352331605, + "grad_norm": 27.73579733476068, + "learning_rate": 1.641438482378529e-05, + "loss": 1.0971, + "step": 449 + }, + { + "epoch": 1.16580310880829, + "grad_norm": 32.84347174567353, + "learning_rate": 1.6329972957560736e-05, + "loss": 1.0579, + "step": 450 + }, + { + "epoch": 1.1683937823834196, + "grad_norm": 30.06456192962641, + "learning_rate": 1.6245628691256032e-05, + "loss": 1.1057, + "step": 451 + }, + { + "epoch": 1.1709844559585492, + "grad_norm": 36.554506394377846, + "learning_rate": 1.616135357844709e-05, + "loss": 1.1008, + "step": 452 + }, + { + "epoch": 1.1735751295336787, + "grad_norm": 27.358643056184114, + "learning_rate": 1.6077149171436063e-05, + "loss": 1.101, + "step": 453 + }, + { + "epoch": 1.1761658031088082, + "grad_norm": 111.13373813893604, + "learning_rate": 1.599301702122274e-05, + "loss": 1.0688, + "step": 454 + }, + { + "epoch": 1.1787564766839378, + "grad_norm": 33.94168250727336, + "learning_rate": 1.590895867747599e-05, + "loss": 1.0721, + "step": 455 + }, + { + "epoch": 1.1813471502590673, + "grad_norm": 53.93978395349692, + "learning_rate": 1.582497568850517e-05, + "loss": 1.0584, + "step": 456 + }, + { + 
"epoch": 1.1839378238341969, + "grad_norm": 29.19245794937285, + "learning_rate": 1.574106960123169e-05, + "loss": 1.067, + "step": 457 + }, + { + "epoch": 1.1865284974093264, + "grad_norm": 28.06897801999048, + "learning_rate": 1.5657241961160434e-05, + "loss": 1.0899, + "step": 458 + }, + { + "epoch": 1.189119170984456, + "grad_norm": 52.31256652964293, + "learning_rate": 1.557349431235135e-05, + "loss": 1.0925, + "step": 459 + }, + { + "epoch": 1.1917098445595855, + "grad_norm": 65.39771110845307, + "learning_rate": 1.5489828197390988e-05, + "loss": 1.1448, + "step": 460 + }, + { + "epoch": 1.194300518134715, + "grad_norm": 27.062780348557254, + "learning_rate": 1.5406245157364093e-05, + "loss": 1.0871, + "step": 461 + }, + { + "epoch": 1.1968911917098446, + "grad_norm": 41.667025056250424, + "learning_rate": 1.5322746731825195e-05, + "loss": 1.048, + "step": 462 + }, + { + "epoch": 1.1994818652849741, + "grad_norm": 24.936669803360665, + "learning_rate": 1.5239334458770291e-05, + "loss": 1.1243, + "step": 463 + }, + { + "epoch": 1.2020725388601037, + "grad_norm": 26.65392149600558, + "learning_rate": 1.5156009874608484e-05, + "loss": 1.0919, + "step": 464 + }, + { + "epoch": 1.2046632124352332, + "grad_norm": 48.57730651937978, + "learning_rate": 1.5072774514133708e-05, + "loss": 1.1259, + "step": 465 + }, + { + "epoch": 1.2072538860103628, + "grad_norm": 31.34891257114439, + "learning_rate": 1.4989629910496424e-05, + "loss": 1.0733, + "step": 466 + }, + { + "epoch": 1.2098445595854923, + "grad_norm": 24.541559850584985, + "learning_rate": 1.4906577595175428e-05, + "loss": 1.1166, + "step": 467 + }, + { + "epoch": 1.2124352331606219, + "grad_norm": 20.4345832961354, + "learning_rate": 1.4823619097949584e-05, + "loss": 1.0916, + "step": 468 + }, + { + "epoch": 1.2150259067357512, + "grad_norm": 28.860712194727487, + "learning_rate": 1.4740755946869708e-05, + "loss": 1.1043, + "step": 469 + }, + { + "epoch": 1.2176165803108807, + "grad_norm": 25.71820242946282, + 
"learning_rate": 1.4657989668230363e-05, + "loss": 1.0949, + "step": 470 + }, + { + "epoch": 1.2202072538860103, + "grad_norm": 51.16994773097077, + "learning_rate": 1.4575321786541801e-05, + "loss": 1.141, + "step": 471 + }, + { + "epoch": 1.2227979274611398, + "grad_norm": 32.70442309640389, + "learning_rate": 1.4492753824501833e-05, + "loss": 1.1127, + "step": 472 + }, + { + "epoch": 1.2253886010362693, + "grad_norm": 21.913285172411495, + "learning_rate": 1.4410287302967813e-05, + "loss": 1.084, + "step": 473 + }, + { + "epoch": 1.2279792746113989, + "grad_norm": 34.45727214001296, + "learning_rate": 1.4327923740928613e-05, + "loss": 1.0836, + "step": 474 + }, + { + "epoch": 1.2305699481865284, + "grad_norm": 26.768013926034776, + "learning_rate": 1.4245664655476663e-05, + "loss": 1.1264, + "step": 475 + }, + { + "epoch": 1.233160621761658, + "grad_norm": 28.401965255935572, + "learning_rate": 1.4163511561779956e-05, + "loss": 1.0805, + "step": 476 + }, + { + "epoch": 1.2357512953367875, + "grad_norm": 29.19935757288793, + "learning_rate": 1.4081465973054216e-05, + "loss": 1.0825, + "step": 477 + }, + { + "epoch": 1.238341968911917, + "grad_norm": 24.55918541541201, + "learning_rate": 1.3999529400534941e-05, + "loss": 1.1164, + "step": 478 + }, + { + "epoch": 1.2409326424870466, + "grad_norm": 25.35635406268312, + "learning_rate": 1.3917703353449646e-05, + "loss": 1.1334, + "step": 479 + }, + { + "epoch": 1.2435233160621761, + "grad_norm": 45.453901005004184, + "learning_rate": 1.3835989338989996e-05, + "loss": 1.1387, + "step": 480 + }, + { + "epoch": 1.2461139896373057, + "grad_norm": 21.67852694202104, + "learning_rate": 1.375438886228411e-05, + "loss": 1.0846, + "step": 481 + }, + { + "epoch": 1.2487046632124352, + "grad_norm": 171.2474074894732, + "learning_rate": 1.3672903426368773e-05, + "loss": 1.1388, + "step": 482 + }, + { + "epoch": 1.2512953367875648, + "grad_norm": 43.18223835070906, + "learning_rate": 1.3591534532161781e-05, + "loss": 1.1483, + 
"step": 483 + }, + { + "epoch": 1.2538860103626943, + "grad_norm": 29.447332565856644, + "learning_rate": 1.3510283678434317e-05, + "loss": 1.07, + "step": 484 + }, + { + "epoch": 1.2564766839378239, + "grad_norm": 28.600251051615228, + "learning_rate": 1.3429152361783307e-05, + "loss": 1.0798, + "step": 485 + }, + { + "epoch": 1.2564766839378239, + "eval_loss": 1.085669755935669, + "eval_runtime": 38.1134, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.233, + "step": 485 + }, + { + "epoch": 1.2590673575129534, + "grad_norm": 47.124643074410464, + "learning_rate": 1.3348142076603876e-05, + "loss": 1.0875, + "step": 486 + }, + { + "epoch": 1.261658031088083, + "grad_norm": 42.06019726307143, + "learning_rate": 1.3267254315061797e-05, + "loss": 1.1429, + "step": 487 + }, + { + "epoch": 1.2642487046632125, + "grad_norm": 18.950734630756962, + "learning_rate": 1.318649056706605e-05, + "loss": 1.0747, + "step": 488 + }, + { + "epoch": 1.266839378238342, + "grad_norm": 31.903949502516806, + "learning_rate": 1.3105852320241326e-05, + "loss": 1.1041, + "step": 489 + }, + { + "epoch": 1.2694300518134716, + "grad_norm": 22.957473008085927, + "learning_rate": 1.3025341059900675e-05, + "loss": 1.1046, + "step": 490 + }, + { + "epoch": 1.2720207253886011, + "grad_norm": 22.325983256563678, + "learning_rate": 1.2944958269018103e-05, + "loss": 1.0643, + "step": 491 + }, + { + "epoch": 1.2746113989637307, + "grad_norm": 29.689383331974955, + "learning_rate": 1.2864705428201307e-05, + "loss": 1.0949, + "step": 492 + }, + { + "epoch": 1.2772020725388602, + "grad_norm": 25.338298442945575, + "learning_rate": 1.2784584015664337e-05, + "loss": 1.0725, + "step": 493 + }, + { + "epoch": 1.2797927461139897, + "grad_norm": 31.591732488078588, + "learning_rate": 1.2704595507200435e-05, + "loss": 1.0347, + "step": 494 + }, + { + "epoch": 1.2823834196891193, + "grad_norm": 42.96243570696118, + "learning_rate": 1.26247413761548e-05, + "loss": 1.1196, + "step": 495 + }, + { + 
"epoch": 1.2849740932642488, + "grad_norm": 26.559546676266024, + "learning_rate": 1.254502309339749e-05, + "loss": 1.0187, + "step": 496 + }, + { + "epoch": 1.2875647668393784, + "grad_norm": 27.58444017584016, + "learning_rate": 1.2465442127296297e-05, + "loss": 1.0985, + "step": 497 + }, + { + "epoch": 1.2901554404145077, + "grad_norm": 36.53028730423797, + "learning_rate": 1.2385999943689732e-05, + "loss": 1.068, + "step": 498 + }, + { + "epoch": 1.2927461139896372, + "grad_norm": 38.94837307599113, + "learning_rate": 1.2306698005859975e-05, + "loss": 1.0736, + "step": 499 + }, + { + "epoch": 1.2953367875647668, + "grad_norm": 36.67208266195125, + "learning_rate": 1.2227537774505996e-05, + "loss": 1.119, + "step": 500 + }, + { + "epoch": 1.2979274611398963, + "grad_norm": 31.086410648635283, + "learning_rate": 1.2148520707716567e-05, + "loss": 1.1094, + "step": 501 + }, + { + "epoch": 1.3005181347150259, + "grad_norm": 27.96977481605826, + "learning_rate": 1.2069648260943473e-05, + "loss": 1.1345, + "step": 502 + }, + { + "epoch": 1.3031088082901554, + "grad_norm": 22.89450502840197, + "learning_rate": 1.1990921886974669e-05, + "loss": 1.12, + "step": 503 + }, + { + "epoch": 1.305699481865285, + "grad_norm": 18.54206032224653, + "learning_rate": 1.1912343035907535e-05, + "loss": 1.0929, + "step": 504 + }, + { + "epoch": 1.3082901554404145, + "grad_norm": 38.9386007237313, + "learning_rate": 1.1833913155122132e-05, + "loss": 1.1381, + "step": 505 + }, + { + "epoch": 1.310880829015544, + "grad_norm": 37.05899458809635, + "learning_rate": 1.1755633689254609e-05, + "loss": 1.0535, + "step": 506 + }, + { + "epoch": 1.3134715025906736, + "grad_norm": 27.716372794195156, + "learning_rate": 1.1677506080170512e-05, + "loss": 1.1342, + "step": 507 + }, + { + "epoch": 1.3160621761658031, + "grad_norm": 40.42306246079416, + "learning_rate": 1.1599531766938306e-05, + "loss": 1.0887, + "step": 508 + }, + { + "epoch": 1.3186528497409327, + "grad_norm": 98.56681767405578, + 
"learning_rate": 1.1521712185802789e-05, + "loss": 1.0954, + "step": 509 + }, + { + "epoch": 1.3212435233160622, + "grad_norm": 34.42816933350743, + "learning_rate": 1.1444048770158718e-05, + "loss": 1.0512, + "step": 510 + }, + { + "epoch": 1.3238341968911918, + "grad_norm": 52.457523653614096, + "learning_rate": 1.136654295052433e-05, + "loss": 1.1599, + "step": 511 + }, + { + "epoch": 1.3264248704663213, + "grad_norm": 26.832339531661276, + "learning_rate": 1.1289196154515048e-05, + "loss": 1.0602, + "step": 512 + }, + { + "epoch": 1.3290155440414508, + "grad_norm": 32.746047673769816, + "learning_rate": 1.1212009806817163e-05, + "loss": 1.1544, + "step": 513 + }, + { + "epoch": 1.3316062176165804, + "grad_norm": 37.44483451702055, + "learning_rate": 1.1134985329161608e-05, + "loss": 1.1421, + "step": 514 + }, + { + "epoch": 1.33419689119171, + "grad_norm": 28.625976525737606, + "learning_rate": 1.1058124140297718e-05, + "loss": 1.0858, + "step": 515 + }, + { + "epoch": 1.3367875647668392, + "grad_norm": 38.64141195246213, + "learning_rate": 1.0981427655967183e-05, + "loss": 1.0983, + "step": 516 + }, + { + "epoch": 1.3393782383419688, + "grad_norm": 29.989753893533425, + "learning_rate": 1.0904897288877891e-05, + "loss": 1.1269, + "step": 517 + }, + { + "epoch": 1.3419689119170983, + "grad_norm": 48.63990665515511, + "learning_rate": 1.0828534448677942e-05, + "loss": 1.0844, + "step": 518 + }, + { + "epoch": 1.3445595854922279, + "grad_norm": 25.477227318250847, + "learning_rate": 1.0752340541929711e-05, + "loss": 1.0742, + "step": 519 + }, + { + "epoch": 1.3471502590673574, + "grad_norm": 26.363588814537763, + "learning_rate": 1.0676316972083867e-05, + "loss": 1.0533, + "step": 520 + }, + { + "epoch": 1.349740932642487, + "grad_norm": 34.59968737708606, + "learning_rate": 1.060046513945361e-05, + "loss": 1.0983, + "step": 521 + }, + { + "epoch": 1.3523316062176165, + "grad_norm": 52.51652561846762, + "learning_rate": 1.0524786441188786e-05, + "loss": 1.1319, + 
"step": 522 + }, + { + "epoch": 1.354922279792746, + "grad_norm": 21.360221214301127, + "learning_rate": 1.0449282271250239e-05, + "loss": 1.0627, + "step": 523 + }, + { + "epoch": 1.3575129533678756, + "grad_norm": 37.00053933682603, + "learning_rate": 1.0373954020384073e-05, + "loss": 1.096, + "step": 524 + }, + { + "epoch": 1.3601036269430051, + "grad_norm": 39.212240822687484, + "learning_rate": 1.029880307609608e-05, + "loss": 1.0512, + "step": 525 + }, + { + "epoch": 1.3626943005181347, + "grad_norm": 24.89842378385804, + "learning_rate": 1.0223830822626124e-05, + "loss": 1.0538, + "step": 526 + }, + { + "epoch": 1.3652849740932642, + "grad_norm": 29.14416894424653, + "learning_rate": 1.0149038640922715e-05, + "loss": 1.1538, + "step": 527 + }, + { + "epoch": 1.3678756476683938, + "grad_norm": 31.688722122648855, + "learning_rate": 1.0074427908617515e-05, + "loss": 1.171, + "step": 528 + }, + { + "epoch": 1.3704663212435233, + "grad_norm": 41.918909004413734, + "learning_rate": 1.0000000000000006e-05, + "loss": 1.1203, + "step": 529 + }, + { + "epoch": 1.3730569948186528, + "grad_norm": 26.70963454516576, + "learning_rate": 9.92575628599213e-06, + "loss": 1.0855, + "step": 530 + }, + { + "epoch": 1.3756476683937824, + "grad_norm": 24.819351173466824, + "learning_rate": 9.851698134123095e-06, + "loss": 1.0972, + "step": 531 + }, + { + "epoch": 1.378238341968912, + "grad_norm": 22.100465399566815, + "learning_rate": 9.777826908504126e-06, + "loss": 1.08, + "step": 532 + }, + { + "epoch": 1.3808290155440415, + "grad_norm": 29.31574709406259, + "learning_rate": 9.704143969803392e-06, + "loss": 1.0835, + "step": 533 + }, + { + "epoch": 1.383419689119171, + "grad_norm": 25.551326748473052, + "learning_rate": 9.630650675220892e-06, + "loss": 1.0396, + "step": 534 + }, + { + "epoch": 1.3860103626943006, + "grad_norm": 59.07595627892596, + "learning_rate": 9.557348378463503e-06, + "loss": 1.0814, + "step": 535 + }, + { + "epoch": 1.38860103626943, + "grad_norm": 
24.96501978981908, + "learning_rate": 9.484238429720018e-06, + "loss": 1.0187, + "step": 536 + }, + { + "epoch": 1.3911917098445596, + "grad_norm": 42.530604702279234, + "learning_rate": 9.411322175636298e-06, + "loss": 1.074, + "step": 537 + }, + { + "epoch": 1.3937823834196892, + "grad_norm": 34.91129065632851, + "learning_rate": 9.338600959290414e-06, + "loss": 1.0878, + "step": 538 + }, + { + "epoch": 1.3963730569948187, + "grad_norm": 32.07525956876426, + "learning_rate": 9.266076120167992e-06, + "loss": 1.0962, + "step": 539 + }, + { + "epoch": 1.3989637305699483, + "grad_norm": 40.18387743296675, + "learning_rate": 9.193748994137462e-06, + "loss": 1.1033, + "step": 540 + }, + { + "epoch": 1.4015544041450778, + "grad_norm": 66.68031460980451, + "learning_rate": 9.121620913425508e-06, + "loss": 1.1466, + "step": 541 + }, + { + "epoch": 1.4041450777202074, + "grad_norm": 34.07506059584738, + "learning_rate": 9.04969320659249e-06, + "loss": 1.1184, + "step": 542 + }, + { + "epoch": 1.406735751295337, + "grad_norm": 17.130845779169075, + "learning_rate": 8.977967198508001e-06, + "loss": 1.0803, + "step": 543 + }, + { + "epoch": 1.4093264248704664, + "grad_norm": 22.4457025132615, + "learning_rate": 8.906444210326441e-06, + "loss": 1.0745, + "step": 544 + }, + { + "epoch": 1.411917098445596, + "grad_norm": 73.43971735356851, + "learning_rate": 8.83512555946271e-06, + "loss": 1.0717, + "step": 545 + }, + { + "epoch": 1.4145077720207253, + "grad_norm": 38.16321297719761, + "learning_rate": 8.764012559567899e-06, + "loss": 1.1371, + "step": 546 + }, + { + "epoch": 1.4170984455958548, + "grad_norm": 56.14718024907725, + "learning_rate": 8.693106520505147e-06, + "loss": 1.0185, + "step": 547 + }, + { + "epoch": 1.4196891191709844, + "grad_norm": 53.3812598790062, + "learning_rate": 8.622408748325461e-06, + "loss": 1.0859, + "step": 548 + }, + { + "epoch": 1.422279792746114, + "grad_norm": 39.69041631433326, + "learning_rate": 8.551920545243704e-06, + "loss": 1.1146, + 
"step": 549 + }, + { + "epoch": 1.4248704663212435, + "grad_norm": 24.099260758984773, + "learning_rate": 8.481643209614576e-06, + "loss": 1.0968, + "step": 550 + }, + { + "epoch": 1.427461139896373, + "grad_norm": 22.623850373369237, + "learning_rate": 8.411578035908728e-06, + "loss": 1.0642, + "step": 551 + }, + { + "epoch": 1.4300518134715026, + "grad_norm": 25.343746374404027, + "learning_rate": 8.341726314688875e-06, + "loss": 1.0815, + "step": 552 + }, + { + "epoch": 1.432642487046632, + "grad_norm": 35.82641011588973, + "learning_rate": 8.272089332586089e-06, + "loss": 1.1012, + "step": 553 + }, + { + "epoch": 1.4352331606217616, + "grad_norm": 24.81161215784662, + "learning_rate": 8.20266837227603e-06, + "loss": 1.1086, + "step": 554 + }, + { + "epoch": 1.4378238341968912, + "grad_norm": 54.18243481591251, + "learning_rate": 8.133464712455364e-06, + "loss": 1.0704, + "step": 555 + }, + { + "epoch": 1.4404145077720207, + "grad_norm": 23.602598217141395, + "learning_rate": 8.064479627818213e-06, + "loss": 1.1519, + "step": 556 + }, + { + "epoch": 1.4430051813471503, + "grad_norm": 31.124404868409982, + "learning_rate": 7.995714389032638e-06, + "loss": 1.0705, + "step": 557 + }, + { + "epoch": 1.4455958549222798, + "grad_norm": 24.14171016995626, + "learning_rate": 7.927170262717284e-06, + "loss": 1.1083, + "step": 558 + }, + { + "epoch": 1.4481865284974094, + "grad_norm": 47.987203109917175, + "learning_rate": 7.858848511417998e-06, + "loss": 1.0836, + "step": 559 + }, + { + "epoch": 1.450777202072539, + "grad_norm": 25.871447098066056, + "learning_rate": 7.790750393584616e-06, + "loss": 1.0787, + "step": 560 + }, + { + "epoch": 1.4533678756476685, + "grad_norm": 23.820249113937482, + "learning_rate": 7.72287716354776e-06, + "loss": 1.1165, + "step": 561 + }, + { + "epoch": 1.455958549222798, + "grad_norm": 48.04131308947624, + "learning_rate": 7.65523007149575e-06, + "loss": 1.0819, + "step": 562 + }, + { + "epoch": 1.4585492227979275, + "grad_norm": 
29.273494083692352, + "learning_rate": 7.587810363451544e-06, + "loss": 1.0302, + "step": 563 + }, + { + "epoch": 1.4611398963730569, + "grad_norm": 120.01571222366722, + "learning_rate": 7.5206192812498345e-06, + "loss": 1.1291, + "step": 564 + }, + { + "epoch": 1.4637305699481864, + "grad_norm": 33.16947662083338, + "learning_rate": 7.4536580625141244e-06, + "loss": 1.0842, + "step": 565 + }, + { + "epoch": 1.466321243523316, + "grad_norm": 29.979556378166713, + "learning_rate": 7.386927940633981e-06, + "loss": 1.1116, + "step": 566 + }, + { + "epoch": 1.4689119170984455, + "grad_norm": 27.172344859281896, + "learning_rate": 7.32043014474227e-06, + "loss": 1.0676, + "step": 567 + }, + { + "epoch": 1.471502590673575, + "grad_norm": 30.208548637757318, + "learning_rate": 7.254165899692554e-06, + "loss": 1.1104, + "step": 568 + }, + { + "epoch": 1.4740932642487046, + "grad_norm": 19.385421184583773, + "learning_rate": 7.188136426036498e-06, + "loss": 1.0085, + "step": 569 + }, + { + "epoch": 1.4766839378238341, + "grad_norm": 30.350787749309685, + "learning_rate": 7.12234294000143e-06, + "loss": 1.0584, + "step": 570 + }, + { + "epoch": 1.4792746113989637, + "grad_norm": 31.520305600900198, + "learning_rate": 7.056786653467882e-06, + "loss": 1.0831, + "step": 571 + }, + { + "epoch": 1.4818652849740932, + "grad_norm": 46.13006972574487, + "learning_rate": 6.991468773947321e-06, + "loss": 1.1761, + "step": 572 + }, + { + "epoch": 1.4844559585492227, + "grad_norm": 26.72340868362835, + "learning_rate": 6.926390504559879e-06, + "loss": 1.0605, + "step": 573 + }, + { + "epoch": 1.4870466321243523, + "grad_norm": 25.992965411102556, + "learning_rate": 6.861553044012206e-06, + "loss": 1.1015, + "step": 574 + }, + { + "epoch": 1.4896373056994818, + "grad_norm": 38.60187420279626, + "learning_rate": 6.796957586575364e-06, + "loss": 1.1232, + "step": 575 + }, + { + "epoch": 1.4922279792746114, + "grad_norm": 21.7618591565717, + "learning_rate": 6.732605322062869e-06, + 
"loss": 1.1196, + "step": 576 + }, + { + "epoch": 1.494818652849741, + "grad_norm": 28.233093007170996, + "learning_rate": 6.668497435808736e-06, + "loss": 1.1451, + "step": 577 + }, + { + "epoch": 1.4974093264248705, + "grad_norm": 28.061514297823816, + "learning_rate": 6.604635108645683e-06, + "loss": 1.0832, + "step": 578 + }, + { + "epoch": 1.5, + "grad_norm": 35.34503147975386, + "learning_rate": 6.5410195168833425e-06, + "loss": 1.118, + "step": 579 + }, + { + "epoch": 1.5025906735751295, + "grad_norm": 31.940516004139344, + "learning_rate": 6.477651832286633e-06, + "loss": 1.1052, + "step": 580 + }, + { + "epoch": 1.505181347150259, + "grad_norm": 25.647504733675635, + "learning_rate": 6.414533222054138e-06, + "loss": 1.1055, + "step": 581 + }, + { + "epoch": 1.5077720207253886, + "grad_norm": 68.16422579698298, + "learning_rate": 6.3516648487966456e-06, + "loss": 1.0784, + "step": 582 + }, + { + "epoch": 1.5077720207253886, + "eval_loss": 1.0824710130691528, + "eval_runtime": 37.4923, + "eval_samples_per_second": 19.844, + "eval_steps_per_second": 1.254, + "step": 582 + }, + { + "epoch": 1.5103626943005182, + "grad_norm": 46.95363643283118, + "learning_rate": 6.289047870515692e-06, + "loss": 1.1271, + "step": 583 + }, + { + "epoch": 1.5129533678756477, + "grad_norm": 37.80701104174098, + "learning_rate": 6.226683440582268e-06, + "loss": 1.126, + "step": 584 + }, + { + "epoch": 1.5155440414507773, + "grad_norm": 32.03225059321182, + "learning_rate": 6.164572707715564e-06, + "loss": 1.0152, + "step": 585 + }, + { + "epoch": 1.5181347150259068, + "grad_norm": 31.21438627768379, + "learning_rate": 6.102716815961787e-06, + "loss": 1.1595, + "step": 586 + }, + { + "epoch": 1.5207253886010363, + "grad_norm": 23.55515793723355, + "learning_rate": 6.041116904673125e-06, + "loss": 1.0943, + "step": 587 + }, + { + "epoch": 1.5233160621761659, + "grad_norm": 26.92022994571063, + "learning_rate": 5.979774108486751e-06, + "loss": 1.0554, + "step": 588 + }, + { + "epoch": 
1.5259067357512954, + "grad_norm": 24.957086694295352, + "learning_rate": 5.918689557303885e-06, + "loss": 1.0711, + "step": 589 + }, + { + "epoch": 1.528497409326425, + "grad_norm": 87.48440577770464, + "learning_rate": 5.857864376269051e-06, + "loss": 1.1679, + "step": 590 + }, + { + "epoch": 1.5310880829015545, + "grad_norm": 21.756969247026838, + "learning_rate": 5.7972996857492896e-06, + "loss": 1.0716, + "step": 591 + }, + { + "epoch": 1.533678756476684, + "grad_norm": 33.92695136944769, + "learning_rate": 5.736996601313545e-06, + "loss": 1.0376, + "step": 592 + }, + { + "epoch": 1.5362694300518136, + "grad_norm": 32.738888590276794, + "learning_rate": 5.676956233712139e-06, + "loss": 1.0245, + "step": 593 + }, + { + "epoch": 1.5388601036269431, + "grad_norm": 22.38597679049821, + "learning_rate": 5.617179688856271e-06, + "loss": 1.1103, + "step": 594 + }, + { + "epoch": 1.5414507772020727, + "grad_norm": 30.168619654124416, + "learning_rate": 5.557668067797677e-06, + "loss": 1.2007, + "step": 595 + }, + { + "epoch": 1.5440414507772022, + "grad_norm": 24.460334668593116, + "learning_rate": 5.498422466708349e-06, + "loss": 1.0842, + "step": 596 + }, + { + "epoch": 1.5466321243523318, + "grad_norm": 25.877463433966412, + "learning_rate": 5.439443976860306e-06, + "loss": 1.0537, + "step": 597 + }, + { + "epoch": 1.549222797927461, + "grad_norm": 27.67111694532404, + "learning_rate": 5.38073368460555e-06, + "loss": 1.0863, + "step": 598 + }, + { + "epoch": 1.5518134715025906, + "grad_norm": 43.112045139256026, + "learning_rate": 5.32229267135602e-06, + "loss": 1.1168, + "step": 599 + }, + { + "epoch": 1.5544041450777202, + "grad_norm": 31.60344278763487, + "learning_rate": 5.2641220135636685e-06, + "loss": 1.0939, + "step": 600 + }, + { + "epoch": 1.5569948186528497, + "grad_norm": 37.795536334167195, + "learning_rate": 5.206222782700667e-06, + "loss": 1.1084, + "step": 601 + }, + { + "epoch": 1.5595854922279793, + "grad_norm": 27.529824319458413, + 
"learning_rate": 5.1485960452396266e-06, + "loss": 1.0755, + "step": 602 + }, + { + "epoch": 1.5621761658031088, + "grad_norm": 29.172376961452496, + "learning_rate": 5.091242862634e-06, + "loss": 1.0231, + "step": 603 + }, + { + "epoch": 1.5647668393782384, + "grad_norm": 24.94560254083931, + "learning_rate": 5.0341642912984844e-06, + "loss": 1.0782, + "step": 604 + }, + { + "epoch": 1.567357512953368, + "grad_norm": 31.79546143794924, + "learning_rate": 4.977361382589607e-06, + "loss": 1.1202, + "step": 605 + }, + { + "epoch": 1.5699481865284974, + "grad_norm": 39.3795372477718, + "learning_rate": 4.920835182786316e-06, + "loss": 1.0349, + "step": 606 + }, + { + "epoch": 1.572538860103627, + "grad_norm": 31.308429467189708, + "learning_rate": 4.864586733070755e-06, + "loss": 1.0582, + "step": 607 + }, + { + "epoch": 1.5751295336787565, + "grad_norm": 32.82748366949945, + "learning_rate": 4.808617069509034e-06, + "loss": 1.1246, + "step": 608 + }, + { + "epoch": 1.577720207253886, + "grad_norm": 24.281936328515055, + "learning_rate": 4.752927223032196e-06, + "loss": 1.0679, + "step": 609 + }, + { + "epoch": 1.5803108808290154, + "grad_norm": 111.23884469313498, + "learning_rate": 4.697518219417188e-06, + "loss": 1.1319, + "step": 610 + }, + { + "epoch": 1.582901554404145, + "grad_norm": 35.484299416160596, + "learning_rate": 4.6423910792680005e-06, + "loss": 1.1348, + "step": 611 + }, + { + "epoch": 1.5854922279792745, + "grad_norm": 27.135342529418295, + "learning_rate": 4.587546817996826e-06, + "loss": 1.0948, + "step": 612 + }, + { + "epoch": 1.588082901554404, + "grad_norm": 81.98158494527004, + "learning_rate": 4.532986445805405e-06, + "loss": 1.0864, + "step": 613 + }, + { + "epoch": 1.5906735751295336, + "grad_norm": 61.490418707157346, + "learning_rate": 4.478710967666371e-06, + "loss": 1.0693, + "step": 614 + }, + { + "epoch": 1.593264248704663, + "grad_norm": 25.633018846282962, + "learning_rate": 4.424721383304791e-06, + "loss": 1.1084, + "step": 615 + 
}, + { + "epoch": 1.5958549222797926, + "grad_norm": 28.194280804517373, + "learning_rate": 4.371018687179689e-06, + "loss": 1.1722, + "step": 616 + }, + { + "epoch": 1.5984455958549222, + "grad_norm": 27.8080566828581, + "learning_rate": 4.317603868465794e-06, + "loss": 1.1171, + "step": 617 + }, + { + "epoch": 1.6010362694300517, + "grad_norm": 42.959036729178806, + "learning_rate": 4.264477911035265e-06, + "loss": 1.074, + "step": 618 + }, + { + "epoch": 1.6036269430051813, + "grad_norm": 23.937218136554392, + "learning_rate": 4.211641793439609e-06, + "loss": 1.13, + "step": 619 + }, + { + "epoch": 1.6062176165803108, + "grad_norm": 43.913677975121566, + "learning_rate": 4.159096488891623e-06, + "loss": 1.1671, + "step": 620 + }, + { + "epoch": 1.6088082901554404, + "grad_norm": 48.107566289352114, + "learning_rate": 4.106842965247497e-06, + "loss": 1.1071, + "step": 621 + }, + { + "epoch": 1.61139896373057, + "grad_norm": 28.25790913819402, + "learning_rate": 4.054882184988971e-06, + "loss": 1.0716, + "step": 622 + }, + { + "epoch": 1.6139896373056994, + "grad_norm": 26.59960827233381, + "learning_rate": 4.003215105205613e-06, + "loss": 1.146, + "step": 623 + }, + { + "epoch": 1.616580310880829, + "grad_norm": 22.79614250574067, + "learning_rate": 3.951842677577171e-06, + "loss": 1.0761, + "step": 624 + }, + { + "epoch": 1.6191709844559585, + "grad_norm": 24.24036779343114, + "learning_rate": 3.900765848356083e-06, + "loss": 1.1037, + "step": 625 + }, + { + "epoch": 1.621761658031088, + "grad_norm": 27.295669679621373, + "learning_rate": 3.849985558349998e-06, + "loss": 1.1015, + "step": 626 + }, + { + "epoch": 1.6243523316062176, + "grad_norm": 54.413225233914176, + "learning_rate": 3.799502742904497e-06, + "loss": 1.0318, + "step": 627 + }, + { + "epoch": 1.6269430051813472, + "grad_norm": 38.84848713400369, + "learning_rate": 3.749318331885825e-06, + "loss": 1.1147, + "step": 628 + }, + { + "epoch": 1.6295336787564767, + "grad_norm": 23.912199342429506, + 
"learning_rate": 3.699433249663775e-06, + "loss": 1.1439, + "step": 629 + }, + { + "epoch": 1.6321243523316062, + "grad_norm": 48.95526983090661, + "learning_rate": 3.649848415094681e-06, + "loss": 1.0229, + "step": 630 + }, + { + "epoch": 1.6347150259067358, + "grad_norm": 32.099897123524585, + "learning_rate": 3.60056474150446e-06, + "loss": 1.0589, + "step": 631 + }, + { + "epoch": 1.6373056994818653, + "grad_norm": 31.802660850585973, + "learning_rate": 3.551583136671817e-06, + "loss": 1.1137, + "step": 632 + }, + { + "epoch": 1.6398963730569949, + "grad_norm": 34.2655686599537, + "learning_rate": 3.5029045028115105e-06, + "loss": 1.1318, + "step": 633 + }, + { + "epoch": 1.6424870466321244, + "grad_norm": 191.48847051006786, + "learning_rate": 3.4545297365577437e-06, + "loss": 1.0921, + "step": 634 + }, + { + "epoch": 1.645077720207254, + "grad_norm": 24.236450154622357, + "learning_rate": 3.406459728947622e-06, + "loss": 1.0851, + "step": 635 + }, + { + "epoch": 1.6476683937823835, + "grad_norm": 38.819342476228876, + "learning_rate": 3.358695365404785e-06, + "loss": 1.0962, + "step": 636 + }, + { + "epoch": 1.650259067357513, + "grad_norm": 31.53545103406636, + "learning_rate": 3.3112375257230547e-06, + "loss": 1.0994, + "step": 637 + }, + { + "epoch": 1.6528497409326426, + "grad_norm": 71.55299438562814, + "learning_rate": 3.2640870840502646e-06, + "loss": 1.08, + "step": 638 + }, + { + "epoch": 1.6554404145077721, + "grad_norm": 57.94234006640972, + "learning_rate": 3.2172449088721235e-06, + "loss": 1.0921, + "step": 639 + }, + { + "epoch": 1.6580310880829017, + "grad_norm": 58.15229256885828, + "learning_rate": 3.1707118629962607e-06, + "loss": 1.0981, + "step": 640 + }, + { + "epoch": 1.6606217616580312, + "grad_norm": 25.105795165561457, + "learning_rate": 3.1244888035362875e-06, + "loss": 1.101, + "step": 641 + }, + { + "epoch": 1.6632124352331608, + "grad_norm": 33.15366058006866, + "learning_rate": 3.0785765818960534e-06, + "loss": 1.0517, + "step": 
642 + }, + { + "epoch": 1.6658031088082903, + "grad_norm": 35.79893709161297, + "learning_rate": 3.0329760437539233e-06, + "loss": 1.0886, + "step": 643 + }, + { + "epoch": 1.6683937823834198, + "grad_norm": 49.59918009099835, + "learning_rate": 2.9876880290472376e-06, + "loss": 1.0756, + "step": 644 + }, + { + "epoch": 1.6709844559585494, + "grad_norm": 21.485142494367135, + "learning_rate": 2.942713371956809e-06, + "loss": 1.1017, + "step": 645 + }, + { + "epoch": 1.6735751295336787, + "grad_norm": 29.23169287520316, + "learning_rate": 2.8980529008915793e-06, + "loss": 1.1241, + "step": 646 + }, + { + "epoch": 1.6761658031088082, + "grad_norm": 27.913868608886553, + "learning_rate": 2.853707438473352e-06, + "loss": 1.0841, + "step": 647 + }, + { + "epoch": 1.6787564766839378, + "grad_norm": 18.438597602055644, + "learning_rate": 2.8096778015216484e-06, + "loss": 1.0891, + "step": 648 + }, + { + "epoch": 1.6813471502590673, + "grad_norm": 54.0556941620233, + "learning_rate": 2.7659648010386365e-06, + "loss": 1.0589, + "step": 649 + }, + { + "epoch": 1.6839378238341969, + "grad_norm": 108.10101848740734, + "learning_rate": 2.7225692421942306e-06, + "loss": 1.0766, + "step": 650 + }, + { + "epoch": 1.6865284974093264, + "grad_norm": 106.58835736628185, + "learning_rate": 2.679491924311226e-06, + "loss": 1.1144, + "step": 651 + }, + { + "epoch": 1.689119170984456, + "grad_norm": 31.53371570516213, + "learning_rate": 2.6367336408506063e-06, + "loss": 1.02, + "step": 652 + }, + { + "epoch": 1.6917098445595855, + "grad_norm": 36.263088086669775, + "learning_rate": 2.594295179396895e-06, + "loss": 1.0679, + "step": 653 + }, + { + "epoch": 1.694300518134715, + "grad_norm": 24.47507184337666, + "learning_rate": 2.5521773216436875e-06, + "loss": 1.1092, + "step": 654 + }, + { + "epoch": 1.6968911917098446, + "grad_norm": 33.05899532106974, + "learning_rate": 2.5103808433792075e-06, + "loss": 1.053, + "step": 655 + }, + { + "epoch": 1.6994818652849741, + "grad_norm": 
29.132344102799873, + "learning_rate": 2.468906514472065e-06, + "loss": 1.0518, + "step": 656 + }, + { + "epoch": 1.7020725388601037, + "grad_norm": 43.48960854254409, + "learning_rate": 2.4277550988570362e-06, + "loss": 1.0537, + "step": 657 + }, + { + "epoch": 1.704663212435233, + "grad_norm": 28.13627467897817, + "learning_rate": 2.3869273545210158e-06, + "loss": 1.0558, + "step": 658 + }, + { + "epoch": 1.7072538860103625, + "grad_norm": 33.18164212520423, + "learning_rate": 2.3464240334890496e-06, + "loss": 1.054, + "step": 659 + }, + { + "epoch": 1.709844559585492, + "grad_norm": 41.884394437273144, + "learning_rate": 2.3062458818104804e-06, + "loss": 1.0871, + "step": 660 + }, + { + "epoch": 1.7124352331606216, + "grad_norm": 27.119840736470916, + "learning_rate": 2.266393639545197e-06, + "loss": 1.0743, + "step": 661 + }, + { + "epoch": 1.7150259067357512, + "grad_norm": 20.70474999023591, + "learning_rate": 2.22686804075003e-06, + "loss": 1.0718, + "step": 662 + }, + { + "epoch": 1.7176165803108807, + "grad_norm": 21.469651089617198, + "learning_rate": 2.187669813465192e-06, + "loss": 1.0584, + "step": 663 + }, + { + "epoch": 1.7202072538860103, + "grad_norm": 29.901704269591495, + "learning_rate": 2.1487996797009103e-06, + "loss": 1.1175, + "step": 664 + }, + { + "epoch": 1.7227979274611398, + "grad_norm": 75.06310533674302, + "learning_rate": 2.110258355424093e-06, + "loss": 1.124, + "step": 665 + }, + { + "epoch": 1.7253886010362693, + "grad_norm": 34.13349153293387, + "learning_rate": 2.0720465505451524e-06, + "loss": 1.1395, + "step": 666 + }, + { + "epoch": 1.7279792746113989, + "grad_norm": 26.83922350447555, + "learning_rate": 2.0341649689049458e-06, + "loss": 1.0449, + "step": 667 + }, + { + "epoch": 1.7305699481865284, + "grad_norm": 37.284339589086024, + "learning_rate": 1.9966143082617797e-06, + "loss": 1.0332, + "step": 668 + }, + { + "epoch": 1.733160621761658, + "grad_norm": 46.453238969399074, + "learning_rate": 1.959395260278587e-06, + 
"loss": 1.1303, + "step": 669 + }, + { + "epoch": 1.7357512953367875, + "grad_norm": 22.743791018223284, + "learning_rate": 1.922508510510166e-06, + "loss": 1.0993, + "step": 670 + }, + { + "epoch": 1.738341968911917, + "grad_norm": 27.788137087891727, + "learning_rate": 1.885954738390572e-06, + "loss": 1.1234, + "step": 671 + }, + { + "epoch": 1.7409326424870466, + "grad_norm": 34.03637743502625, + "learning_rate": 1.8497346172205733e-06, + "loss": 1.085, + "step": 672 + }, + { + "epoch": 1.7435233160621761, + "grad_norm": 30.308363072599853, + "learning_rate": 1.8138488141552856e-06, + "loss": 1.0348, + "step": 673 + }, + { + "epoch": 1.7461139896373057, + "grad_norm": 26.81612464278571, + "learning_rate": 1.7782979901918507e-06, + "loss": 1.0672, + "step": 674 + }, + { + "epoch": 1.7487046632124352, + "grad_norm": 46.96340147563577, + "learning_rate": 1.7430828001572897e-06, + "loss": 1.0807, + "step": 675 + }, + { + "epoch": 1.7512953367875648, + "grad_norm": 30.87064631308438, + "learning_rate": 1.7082038926964162e-06, + "loss": 1.1411, + "step": 676 + }, + { + "epoch": 1.7538860103626943, + "grad_norm": 79.59411718865987, + "learning_rate": 1.6736619102599073e-06, + "loss": 1.0234, + "step": 677 + }, + { + "epoch": 1.7564766839378239, + "grad_norm": 30.875792565440594, + "learning_rate": 1.6394574890924574e-06, + "loss": 1.1506, + "step": 678 + }, + { + "epoch": 1.7590673575129534, + "grad_norm": 34.227935587917464, + "learning_rate": 1.605591259221071e-06, + "loss": 1.0981, + "step": 679 + }, + { + "epoch": 1.7590673575129534, + "eval_loss": 1.0809757709503174, + "eval_runtime": 37.9729, + "eval_samples_per_second": 19.593, + "eval_steps_per_second": 1.238, + "step": 679 + }, + { + "epoch": 1.761658031088083, + "grad_norm": 31.849171622198522, + "learning_rate": 1.572063844443441e-06, + "loss": 1.1227, + "step": 680 + }, + { + "epoch": 1.7642487046632125, + "grad_norm": 32.75765881856165, + "learning_rate": 1.5388758623164802e-06, + "loss": 1.0842, + "step": 
681 + }, + { + "epoch": 1.766839378238342, + "grad_norm": 27.83779558188967, + "learning_rate": 1.5060279241449304e-06, + "loss": 1.0419, + "step": 682 + }, + { + "epoch": 1.7694300518134716, + "grad_norm": 30.646833576522408, + "learning_rate": 1.4735206349701003e-06, + "loss": 1.0983, + "step": 683 + }, + { + "epoch": 1.7720207253886011, + "grad_norm": 29.748071428344947, + "learning_rate": 1.4413545935587415e-06, + "loss": 1.1276, + "step": 684 + }, + { + "epoch": 1.7746113989637307, + "grad_norm": 32.57104117085742, + "learning_rate": 1.4095303923919956e-06, + "loss": 1.0728, + "step": 685 + }, + { + "epoch": 1.7772020725388602, + "grad_norm": 32.02209671450587, + "learning_rate": 1.3780486176544905e-06, + "loss": 1.1148, + "step": 686 + }, + { + "epoch": 1.7797927461139897, + "grad_norm": 31.902388050458736, + "learning_rate": 1.3469098492235521e-06, + "loss": 1.0873, + "step": 687 + }, + { + "epoch": 1.7823834196891193, + "grad_norm": 33.159581668201604, + "learning_rate": 1.316114660658505e-06, + "loss": 1.0308, + "step": 688 + }, + { + "epoch": 1.7849740932642488, + "grad_norm": 25.531240947030152, + "learning_rate": 1.2856636191901296e-06, + "loss": 1.0893, + "step": 689 + }, + { + "epoch": 1.7875647668393784, + "grad_norm": 25.382870674663973, + "learning_rate": 1.255557285710185e-06, + "loss": 1.1089, + "step": 690 + }, + { + "epoch": 1.790155440414508, + "grad_norm": 26.184606368046406, + "learning_rate": 1.225796214761117e-06, + "loss": 1.1515, + "step": 691 + }, + { + "epoch": 1.7927461139896375, + "grad_norm": 27.78595815725415, + "learning_rate": 1.196380954525802e-06, + "loss": 1.0871, + "step": 692 + }, + { + "epoch": 1.795336787564767, + "grad_norm": 32.137607036645285, + "learning_rate": 1.1673120468174837e-06, + "loss": 1.1396, + "step": 693 + }, + { + "epoch": 1.7979274611398963, + "grad_norm": 31.931928767500203, + "learning_rate": 1.1385900270697658e-06, + "loss": 1.1175, + "step": 694 + }, + { + "epoch": 1.8005181347150259, + "grad_norm": 
36.61199052966244, + "learning_rate": 1.110215424326775e-06, + "loss": 1.1867, + "step": 695 + }, + { + "epoch": 1.8031088082901554, + "grad_norm": 49.9081839820131, + "learning_rate": 1.0821887612333959e-06, + "loss": 1.1266, + "step": 696 + }, + { + "epoch": 1.805699481865285, + "grad_norm": 25.346034138603734, + "learning_rate": 1.0545105540256628e-06, + "loss": 1.0614, + "step": 697 + }, + { + "epoch": 1.8082901554404145, + "grad_norm": 47.53838459679947, + "learning_rate": 1.0271813125212237e-06, + "loss": 1.1314, + "step": 698 + }, + { + "epoch": 1.810880829015544, + "grad_norm": 30.496460286583815, + "learning_rate": 1.0002015401099797e-06, + "loss": 1.1067, + "step": 699 + }, + { + "epoch": 1.8134715025906736, + "grad_norm": 29.929097539381686, + "learning_rate": 9.735717337447981e-07, + "loss": 1.0424, + "step": 700 + }, + { + "epoch": 1.8160621761658031, + "grad_norm": 30.887132457194266, + "learning_rate": 9.4729238393235e-07, + "loss": 1.1248, + "step": 701 + }, + { + "epoch": 1.8186528497409327, + "grad_norm": 24.26916275448189, + "learning_rate": 9.21363974724101e-07, + "loss": 1.0577, + "step": 702 + }, + { + "epoch": 1.8212435233160622, + "grad_norm": 40.34641617989283, + "learning_rate": 8.957869837073673e-07, + "loss": 1.1639, + "step": 703 + }, + { + "epoch": 1.8238341968911918, + "grad_norm": 34.3133374466777, + "learning_rate": 8.705618819965411e-07, + "loss": 1.0866, + "step": 704 + }, + { + "epoch": 1.8264248704663213, + "grad_norm": 25.164299615685284, + "learning_rate": 8.456891342243945e-07, + "loss": 1.1232, + "step": 705 + }, + { + "epoch": 1.8290155440414506, + "grad_norm": 129.91297199628124, + "learning_rate": 8.211691985335357e-07, + "loss": 1.1542, + "step": 706 + }, + { + "epoch": 1.8316062176165802, + "grad_norm": 23.928927141144797, + "learning_rate": 7.970025265679648e-07, + "loss": 1.0813, + "step": 707 + }, + { + "epoch": 1.8341968911917097, + "grad_norm": 22.631504479886225, + "learning_rate": 7.731895634647513e-07, + "loss": 
1.1164, + "step": 708 + }, + { + "epoch": 1.8367875647668392, + "grad_norm": 84.2359250723018, + "learning_rate": 7.497307478458382e-07, + "loss": 1.1081, + "step": 709 + }, + { + "epoch": 1.8393782383419688, + "grad_norm": 51.39142883893451, + "learning_rate": 7.266265118099669e-07, + "loss": 1.105, + "step": 710 + }, + { + "epoch": 1.8419689119170983, + "grad_norm": 41.18280727079993, + "learning_rate": 7.038772809247075e-07, + "loss": 1.1211, + "step": 711 + }, + { + "epoch": 1.8445595854922279, + "grad_norm": 34.330855277813534, + "learning_rate": 6.814834742186361e-07, + "loss": 1.0783, + "step": 712 + }, + { + "epoch": 1.8471502590673574, + "grad_norm": 46.858780552576334, + "learning_rate": 6.594455041735925e-07, + "loss": 1.0214, + "step": 713 + }, + { + "epoch": 1.849740932642487, + "grad_norm": 94.2712798319484, + "learning_rate": 6.377637767171152e-07, + "loss": 1.098, + "step": 714 + }, + { + "epoch": 1.8523316062176165, + "grad_norm": 33.00073975184253, + "learning_rate": 6.164386912149289e-07, + "loss": 1.0906, + "step": 715 + }, + { + "epoch": 1.854922279792746, + "grad_norm": 30.030119862133272, + "learning_rate": 5.954706404636179e-07, + "loss": 1.1073, + "step": 716 + }, + { + "epoch": 1.8575129533678756, + "grad_norm": 46.42282973245658, + "learning_rate": 5.748600106833735e-07, + "loss": 1.1553, + "step": 717 + }, + { + "epoch": 1.8601036269430051, + "grad_norm": 26.48910946182044, + "learning_rate": 5.546071815108845e-07, + "loss": 1.0704, + "step": 718 + }, + { + "epoch": 1.8626943005181347, + "grad_norm": 29.34093197155635, + "learning_rate": 5.347125259923491e-07, + "loss": 1.1, + "step": 719 + }, + { + "epoch": 1.8652849740932642, + "grad_norm": 24.689130499541356, + "learning_rate": 5.151764105766011e-07, + "loss": 1.067, + "step": 720 + }, + { + "epoch": 1.8678756476683938, + "grad_norm": 21.25619644617847, + "learning_rate": 4.959991951083498e-07, + "loss": 1.1125, + "step": 721 + }, + { + "epoch": 1.8704663212435233, + "grad_norm": 
23.946272802272112, + "learning_rate": 4.771812328215708e-07, + "loss": 1.0798, + "step": 722 + }, + { + "epoch": 1.8730569948186528, + "grad_norm": 33.286030816378954, + "learning_rate": 4.587228703329838e-07, + "loss": 1.0756, + "step": 723 + }, + { + "epoch": 1.8756476683937824, + "grad_norm": 109.02542545414109, + "learning_rate": 4.40624447635678e-07, + "loss": 1.073, + "step": 724 + }, + { + "epoch": 1.878238341968912, + "grad_norm": 133.80505789447585, + "learning_rate": 4.228862980928439e-07, + "loss": 1.1218, + "step": 725 + }, + { + "epoch": 1.8808290155440415, + "grad_norm": 28.671374209715793, + "learning_rate": 4.0550874843163337e-07, + "loss": 1.1546, + "step": 726 + }, + { + "epoch": 1.883419689119171, + "grad_norm": 20.092775273550536, + "learning_rate": 3.8849211873714266e-07, + "loss": 1.0608, + "step": 727 + }, + { + "epoch": 1.8860103626943006, + "grad_norm": 18.87195408427635, + "learning_rate": 3.7183672244652135e-07, + "loss": 1.0437, + "step": 728 + }, + { + "epoch": 1.88860103626943, + "grad_norm": 24.985644120932864, + "learning_rate": 3.5554286634318814e-07, + "loss": 1.0989, + "step": 729 + }, + { + "epoch": 1.8911917098445596, + "grad_norm": 24.09887960702925, + "learning_rate": 3.3961085055119083e-07, + "loss": 1.0347, + "step": 730 + }, + { + "epoch": 1.8937823834196892, + "grad_norm": 98.50926523613283, + "learning_rate": 3.2404096852967305e-07, + "loss": 1.1163, + "step": 731 + }, + { + "epoch": 1.8963730569948187, + "grad_norm": 42.45357973111845, + "learning_rate": 3.0883350706746973e-07, + "loss": 1.1497, + "step": 732 + }, + { + "epoch": 1.8989637305699483, + "grad_norm": 25.430184794482617, + "learning_rate": 2.9398874627782014e-07, + "loss": 1.0154, + "step": 733 + }, + { + "epoch": 1.9015544041450778, + "grad_norm": 32.56552224066898, + "learning_rate": 2.7950695959322093e-07, + "loss": 1.0976, + "step": 734 + }, + { + "epoch": 1.9041450777202074, + "grad_norm": 25.518391980867197, + "learning_rate": 2.653884137603702e-07, + 
"loss": 1.1122, + "step": 735 + }, + { + "epoch": 1.906735751295337, + "grad_norm": 20.537146853099735, + "learning_rate": 2.516333688352801e-07, + "loss": 1.0592, + "step": 736 + }, + { + "epoch": 1.9093264248704664, + "grad_norm": 25.28898033119641, + "learning_rate": 2.382420781784589e-07, + "loss": 1.0706, + "step": 737 + }, + { + "epoch": 1.911917098445596, + "grad_norm": 55.74230904177274, + "learning_rate": 2.2521478845025867e-07, + "loss": 1.1706, + "step": 738 + }, + { + "epoch": 1.9145077720207255, + "grad_norm": 42.768439146141375, + "learning_rate": 2.1255173960634146e-07, + "loss": 1.0917, + "step": 739 + }, + { + "epoch": 1.917098445595855, + "grad_norm": 31.627146067352545, + "learning_rate": 2.0025316489323597e-07, + "loss": 1.0842, + "step": 740 + }, + { + "epoch": 1.9196891191709846, + "grad_norm": 67.01614151937272, + "learning_rate": 1.8831929084406119e-07, + "loss": 1.1287, + "step": 741 + }, + { + "epoch": 1.922279792746114, + "grad_norm": 56.931018082229045, + "learning_rate": 1.7675033727434288e-07, + "loss": 1.148, + "step": 742 + }, + { + "epoch": 1.9248704663212435, + "grad_norm": 35.24107640275113, + "learning_rate": 1.655465172779702e-07, + "loss": 1.0814, + "step": 743 + }, + { + "epoch": 1.927461139896373, + "grad_norm": 28.45308969334642, + "learning_rate": 1.547080372232679e-07, + "loss": 1.1092, + "step": 744 + }, + { + "epoch": 1.9300518134715026, + "grad_norm": 67.36918357149847, + "learning_rate": 1.44235096749199e-07, + "loss": 1.1332, + "step": 745 + }, + { + "epoch": 1.932642487046632, + "grad_norm": 33.50866269131509, + "learning_rate": 1.3412788876167925e-07, + "loss": 1.0884, + "step": 746 + }, + { + "epoch": 1.9352331606217616, + "grad_norm": 34.359505767271465, + "learning_rate": 1.2438659943003306e-07, + "loss": 0.9982, + "step": 747 + }, + { + "epoch": 1.9378238341968912, + "grad_norm": 44.805290236152125, + "learning_rate": 1.1501140818355627e-07, + "loss": 1.065, + "step": 748 + }, + { + "epoch": 1.9404145077720207, 
+ "grad_norm": 35.70322964853727, + "learning_rate": 1.0600248770821886e-07, + "loss": 1.1435, + "step": 749 + }, + { + "epoch": 1.9430051813471503, + "grad_norm": 37.7037381444634, + "learning_rate": 9.736000394348299e-08, + "loss": 1.1085, + "step": 750 + }, + { + "epoch": 1.9455958549222798, + "grad_norm": 19.88028370873119, + "learning_rate": 8.908411607923884e-08, + "loss": 1.0903, + "step": 751 + }, + { + "epoch": 1.9481865284974094, + "grad_norm": 22.037441897095253, + "learning_rate": 8.117497655287798e-08, + "loss": 1.0621, + "step": 752 + }, + { + "epoch": 1.950777202072539, + "grad_norm": 36.597366625713235, + "learning_rate": 7.363273104648904e-08, + "loss": 1.134, + "step": 753 + }, + { + "epoch": 1.9533678756476682, + "grad_norm": 36.91544331752125, + "learning_rate": 6.645751848417093e-08, + "loss": 1.0894, + "step": 754 + }, + { + "epoch": 1.9559585492227978, + "grad_norm": 30.791496804716704, + "learning_rate": 5.964947102946594e-08, + "loss": 1.0774, + "step": 755 + }, + { + "epoch": 1.9585492227979273, + "grad_norm": 24.76204564200231, + "learning_rate": 5.320871408294403e-08, + "loss": 1.1167, + "step": 756 + }, + { + "epoch": 1.9611398963730569, + "grad_norm": 31.78111531944549, + "learning_rate": 4.713536627987347e-08, + "loss": 1.0709, + "step": 757 + }, + { + "epoch": 1.9637305699481864, + "grad_norm": 36.388018093644106, + "learning_rate": 4.1429539488047066e-08, + "loss": 1.0492, + "step": 758 + }, + { + "epoch": 1.966321243523316, + "grad_norm": 27.235358627643226, + "learning_rate": 3.6091338805719356e-08, + "loss": 1.1128, + "step": 759 + }, + { + "epoch": 1.9689119170984455, + "grad_norm": 26.526882273916378, + "learning_rate": 3.1120862559670396e-08, + "loss": 1.1129, + "step": 760 + }, + { + "epoch": 1.971502590673575, + "grad_norm": 28.962449597773997, + "learning_rate": 2.651820230338942e-08, + "loss": 1.1286, + "step": 761 + }, + { + "epoch": 1.9740932642487046, + "grad_norm": 104.33848533313731, + "learning_rate": 
2.2283442815402845e-08, + "loss": 1.117, + "step": 762 + }, + { + "epoch": 1.9766839378238341, + "grad_norm": 179.66099272542536, + "learning_rate": 1.8416662097693326e-08, + "loss": 1.0788, + "step": 763 + }, + { + "epoch": 1.9792746113989637, + "grad_norm": 28.438877123785307, + "learning_rate": 1.491793137427866e-08, + "loss": 1.1436, + "step": 764 + }, + { + "epoch": 1.9818652849740932, + "grad_norm": 44.454308819411644, + "learning_rate": 1.1787315089895057e-08, + "loss": 1.1108, + "step": 765 + }, + { + "epoch": 1.9844559585492227, + "grad_norm": 53.23249975862293, + "learning_rate": 9.024870908802552e-09, + "loss": 0.9971, + "step": 766 + }, + { + "epoch": 1.9870466321243523, + "grad_norm": 35.2043549019015, + "learning_rate": 6.630649713739168e-09, + "loss": 1.1205, + "step": 767 + }, + { + "epoch": 1.9896373056994818, + "grad_norm": 22.286284343829376, + "learning_rate": 4.6046956049639045e-09, + "loss": 1.0848, + "step": 768 + }, + { + "epoch": 1.9922279792746114, + "grad_norm": 24.94719200433733, + "learning_rate": 2.94704589946182e-09, + "loss": 1.1308, + "step": 769 + }, + { + "epoch": 1.994818652849741, + "grad_norm": 41.684623957583106, + "learning_rate": 1.657731130246809e-09, + "loss": 1.1555, + "step": 770 + }, + { + "epoch": 1.9974093264248705, + "grad_norm": 55.480495348949425, + "learning_rate": 7.367750458020518e-10, + "loss": 1.129, + "step": 771 + }, + { + "epoch": 2.0, + "grad_norm": 43.2148652279276, + "learning_rate": 1.8419460964258505e-10, + "loss": 1.0835, + "step": 772 + } + ], + "logging_steps": 1, + "max_steps": 772, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 193, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3363988309999616e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff 
--git a/checkpoint-772/training_args.bin b/checkpoint-772/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4d661b15e5bbd8390fd11a502bea76680041301 --- /dev/null +++ b/checkpoint-772/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe76c44cf1ade69372a2b861f80cfcfc5ba88f283683f660a4a0605f642aee3 +size 8568 diff --git a/checkpoint-772/zero_to_fp32.py b/checkpoint-772/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-772/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-822/README.md b/checkpoint-822/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d958b0327f8e51ca223e270eb789387f6b41fb2 --- /dev/null +++ b/checkpoint-822/README.md @@ -0,0 +1,202 @@ +--- +base_model: THUDM/GLM-4-32B-0414 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.1 \ No newline at end of file diff --git a/checkpoint-822/adapter_config.json b/checkpoint-822/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d23c5bb0164ae65157b73dbb2e6dc419d09b28ad --- /dev/null +++ b/checkpoint-822/adapter_config.json @@ -0,0 +1,41 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/GLM-4-32B-0414", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "down_proj", + "q_proj", + "k_proj", + "gate_up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": true +} \ No newline at end of file diff --git a/checkpoint-822/adapter_model.safetensors b/checkpoint-822/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3bfc934021ae2f94535e9442dcecf9427f7b12c1 --- /dev/null +++ b/checkpoint-822/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9dabe0dcb2a00ba6eca0b1e4fb714d3c1d5289929ed928c9ab44c923fdb4073 +size 5579575888 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16815fd557595e661dab5a16408d01d8bc738a5d --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8217124921ff5a249c3953fe8750c111f39ee584b057ac5596ebc7e42b122e +size 2458601314 
diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41058968a54c3bfd5e358e754073c22bf7811ff8 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf46bfcdeced91f17b777758b5806c22a3f781b6a7ae5b7600171774a7671fc5 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5132756409fd067405e321b7018532c627b38684 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57fd306863f4304d57f9023d7c32314dfc0b620cc0c6367bc8e1d9e7fb11a012 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..216308849092812f84d7046ad32eed4104b8bf54 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de80e3a4e60771fb87094ed8cf54a31277bd49cffc9f7b584ed6644528236371 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49a891d22c90cba7425954eb0407bdd1c4efb3ed --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0a3634a9b0e7b5b158ebda08c1170da9c1fe2faa98325ee847cf175b72e68905 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..398d63e074368586e5a07b2572328a4346163c60 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b70a9bfa4920e0ca156acfc71a9c65d73792cc1477fb0b834a9d0c64a01a33f +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ba7ec2e2194d6cc8ec13f75c9675a133cbc51ff --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d15c17bc85a35923d0fd407b4b7284d4bc536654f79f4f615608e0bf68bb3232 +size 2458601314 diff --git a/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cce7869ee261dfb20609964701f785355c18595 --- /dev/null +++ b/checkpoint-822/global_step821/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c6a26561aa408977821f667dd9ffad58042de3c5a3bd8755c8b17d94d965be0 +size 2458601314 diff --git a/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f490f3e848c1bf755364ff89f7c1f705f0eb805 --- 
/dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6e664a02794bb3bcea64d6dbccffbf8cd011135fd9d479bae3e940093260c0 +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fe52bc115942793e064cea7c14d3a518d57742c --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b804acd6b0587702229c9fc03ee6f33a832b75d651a3b9f36460a5d3bbc3327f +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceb5431be8f713f4fa27f3b49da1c4f23e238b04 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9be64f1086cb24858a8cbe60b9c8ba1bc4cdb724cf76362b19153f20e4a0fad +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41ec666967258ddb9184466e72490b7098db6aa5 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5e59dddf8daf3b5da78b90796d7c60a64dd05d09af6116b6a4c508fefe3260 +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..befdbc0e0502acc54b204842fbc529601551e5ad --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268c1f89f4fc75e297f0d683c7312808815c037812840c115152cc9b4646595a +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..969e10f9bda06b782290e0a3530a15d9fc623a75 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb39d1703bc2a45f71137c7a48ad375b703b40ad9a3e1ea40bbc117d0592506b +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6be1bd2852206278b51a2582ca948997f5139731 --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8166dd6c0a2182a927ac738f3317f53998929e19a9024662cc4057587ddb0e4b +size 752148 diff --git a/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c3ebfbba87842a3e5a237af7da03b24c0efaa0c --- /dev/null +++ b/checkpoint-822/global_step821/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a9e5eca51799f601e4db178a891384f5f49b72eb7dd3e3123ff45eb56ca4e6 +size 752148 diff --git a/checkpoint-822/latest b/checkpoint-822/latest new file mode 100644 index 
0000000000000000000000000000000000000000..3159aab1f7bb3903604150491f83c05295b87c00 --- /dev/null +++ b/checkpoint-822/latest @@ -0,0 +1 @@ +global_step821 \ No newline at end of file diff --git a/checkpoint-822/rng_state_0.pth b/checkpoint-822/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..008a6bab3696310472a1afaaf67aadd849da50c3 --- /dev/null +++ b/checkpoint-822/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb5fed332be2363e0d622c76a17b7a5b6d05bf89825570682adb3cce5ac3b32 +size 15984 diff --git a/checkpoint-822/rng_state_1.pth b/checkpoint-822/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a4cdf0595ae08ff971ae10aac00157c9ab410833 --- /dev/null +++ b/checkpoint-822/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9036603899dce8aed76bec4fedbc4a938c7ff8c25747841b38a8a6985bcc5258 +size 15984 diff --git a/checkpoint-822/rng_state_2.pth b/checkpoint-822/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7ce057356bc6924a6de2ba333e030246eb0ec97 --- /dev/null +++ b/checkpoint-822/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5383ee48caba99966f39ef74c58fd9b753b4e81b93e096480e12713d196444 +size 15984 diff --git a/checkpoint-822/rng_state_3.pth b/checkpoint-822/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5cb528dc950ed9161e91a8f0144b8f29af4452e7 --- /dev/null +++ b/checkpoint-822/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7fcbf4184b00ba751039333e9c778fd6d6248e42b7de7962bbaa421f2a9f01 +size 15984 diff --git a/checkpoint-822/rng_state_4.pth b/checkpoint-822/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..56ac3edfc264abf4bb62dbeaa93082b3eb8754f7 --- /dev/null +++ b/checkpoint-822/rng_state_4.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:74b113cac9bc45f3a2b939d24f8bbcd4dd6e88d64c9d08763b93514f25d07726 +size 15984 diff --git a/checkpoint-822/rng_state_5.pth b/checkpoint-822/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..11e5c63d3af4f1835f3ae4aa32a10fd4d1678d42 --- /dev/null +++ b/checkpoint-822/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561dc72467b5f7ee784383e8f29005c89d31198021d0fbe8f7ccb3ccec775670 +size 15984 diff --git a/checkpoint-822/rng_state_6.pth b/checkpoint-822/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..77436a3257a4c1cb7e32859741e535765b91a0e1 --- /dev/null +++ b/checkpoint-822/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80dcc08478ae8fa87319934fd245ff2b4e3e9e1aa8cc251bae816273cf2590cf +size 15984 diff --git a/checkpoint-822/rng_state_7.pth b/checkpoint-822/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..dfea8a63a18beab5870a1386ac65e6eb1fe78182 --- /dev/null +++ b/checkpoint-822/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09de15462c487019f9e0e4a3deee385b63c3fbfd9825baa43d347d9967f6f507 +size 15984 diff --git a/checkpoint-822/scheduler.pt b/checkpoint-822/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b3727e665d98d9888e069fffbfda0cbe4b2913a --- /dev/null +++ b/checkpoint-822/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f38eb6ccddcf04fc28a6ceca8a53e7217e0aa0d7768e55e066d15d6b242cd3 +size 1064 diff --git a/checkpoint-822/special_tokens_map.json b/checkpoint-822/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/checkpoint-822/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + 
"[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-822/tokenizer.json b/checkpoint-822/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/checkpoint-822/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/checkpoint-822/tokenizer_config.json b/checkpoint-822/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/checkpoint-822/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in 
messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-822/trainer_state.json b/checkpoint-822/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2124d7628466454deb0df8507b054c6a0bbecab1 --- /dev/null +++ b/checkpoint-822/trainer_state.json @@ -0,0 +1,5852 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9963547995139734, + "eval_steps": 103, + "global_step": 822, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002430133657351154, + "grad_norm": 715.4923219036787, + "learning_rate": 0.0, + "loss": 1.3541, + "step": 1 + }, + { + "epoch": 0.002430133657351154, + "eval_loss": 1.3335719108581543, + "eval_runtime": 53.4883, + "eval_samples_per_second": 13.91, + "eval_steps_per_second": 1.739, + "step": 1 + }, + { + "epoch": 0.004860267314702308, + "grad_norm": 614.6970578314867, + "learning_rate": 5e-06, + "loss": 1.3775, + "step": 2 + }, + { + "epoch": 0.007290400972053463, + "grad_norm": 471.59017991123795, + "learning_rate": 1e-05, + "loss": 1.339, + 
"step": 3 + }, + { + "epoch": 0.009720534629404616, + "grad_norm": 238.72216262259653, + "learning_rate": 1.5e-05, + "loss": 1.3829, + "step": 4 + }, + { + "epoch": 0.012150668286755772, + "grad_norm": 355.68955726709873, + "learning_rate": 2e-05, + "loss": 1.3597, + "step": 5 + }, + { + "epoch": 0.014580801944106925, + "grad_norm": 414.5627284272111, + "learning_rate": 2.5e-05, + "loss": 1.3862, + "step": 6 + }, + { + "epoch": 0.01701093560145808, + "grad_norm": 534.9877222052693, + "learning_rate": 3e-05, + "loss": 1.2784, + "step": 7 + }, + { + "epoch": 0.019441069258809233, + "grad_norm": 153.38895635666677, + "learning_rate": 3.5e-05, + "loss": 1.3521, + "step": 8 + }, + { + "epoch": 0.02187120291616039, + "grad_norm": 858.293734138087, + "learning_rate": 4e-05, + "loss": 1.2461, + "step": 9 + }, + { + "epoch": 0.024301336573511544, + "grad_norm": 255.81989388533376, + "learning_rate": 4.5e-05, + "loss": 1.2778, + "step": 10 + }, + { + "epoch": 0.026731470230862697, + "grad_norm": 368.91949003479226, + "learning_rate": 5e-05, + "loss": 1.3412, + "step": 11 + }, + { + "epoch": 0.02916160388821385, + "grad_norm": 176.49481799555898, + "learning_rate": 5.500000000000001e-05, + "loss": 1.3437, + "step": 12 + }, + { + "epoch": 0.031591737545565005, + "grad_norm": 208.57742104974147, + "learning_rate": 6e-05, + "loss": 1.2859, + "step": 13 + }, + { + "epoch": 0.03402187120291616, + "grad_norm": 93.26742036471734, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1843, + "step": 14 + }, + { + "epoch": 0.03645200486026731, + "grad_norm": 145.53380444622215, + "learning_rate": 7e-05, + "loss": 1.4281, + "step": 15 + }, + { + "epoch": 0.038882138517618466, + "grad_norm": 126.56724937430516, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3908, + "step": 16 + }, + { + "epoch": 0.041312272174969626, + "grad_norm": 106.19246390662754, + "learning_rate": 8e-05, + "loss": 1.344, + "step": 17 + }, + { + "epoch": 0.04374240583232078, + "grad_norm": 289.348178084847, 
+ "learning_rate": 8.5e-05, + "loss": 1.2708, + "step": 18 + }, + { + "epoch": 0.046172539489671933, + "grad_norm": 286.63676887065634, + "learning_rate": 9e-05, + "loss": 1.3564, + "step": 19 + }, + { + "epoch": 0.04860267314702309, + "grad_norm": 269.6096299101413, + "learning_rate": 9.5e-05, + "loss": 1.2184, + "step": 20 + }, + { + "epoch": 0.05103280680437424, + "grad_norm": 151.28678796160915, + "learning_rate": 0.0001, + "loss": 1.2974, + "step": 21 + }, + { + "epoch": 0.053462940461725394, + "grad_norm": 265.5625538646362, + "learning_rate": 0.000105, + "loss": 1.2703, + "step": 22 + }, + { + "epoch": 0.05589307411907655, + "grad_norm": 724.7157187586193, + "learning_rate": 0.00011000000000000002, + "loss": 1.2691, + "step": 23 + }, + { + "epoch": 0.0583232077764277, + "grad_norm": 425.3768239347252, + "learning_rate": 0.00011499999999999999, + "loss": 1.375, + "step": 24 + }, + { + "epoch": 0.060753341433778855, + "grad_norm": 314.5119318308783, + "learning_rate": 0.00012, + "loss": 1.2952, + "step": 25 + }, + { + "epoch": 0.06318347509113001, + "grad_norm": 557.519173033834, + "learning_rate": 0.000125, + "loss": 1.2923, + "step": 26 + }, + { + "epoch": 0.06561360874848117, + "grad_norm": 211.4069356529637, + "learning_rate": 0.00013000000000000002, + "loss": 1.2629, + "step": 27 + }, + { + "epoch": 0.06804374240583232, + "grad_norm": 299.7742653722713, + "learning_rate": 0.00013500000000000003, + "loss": 1.3099, + "step": 28 + }, + { + "epoch": 0.07047387606318348, + "grad_norm": 182.18551965886013, + "learning_rate": 0.00014, + "loss": 1.2215, + "step": 29 + }, + { + "epoch": 0.07290400972053462, + "grad_norm": 153.38300520125887, + "learning_rate": 0.000145, + "loss": 1.2799, + "step": 30 + }, + { + "epoch": 0.07533414337788578, + "grad_norm": 849.4472853252786, + "learning_rate": 0.00015000000000000001, + "loss": 1.2012, + "step": 31 + }, + { + "epoch": 0.07776427703523693, + "grad_norm": 179.94814586965418, + "learning_rate": 0.000155, + "loss": 
1.2103, + "step": 32 + }, + { + "epoch": 0.08019441069258809, + "grad_norm": 180.36681057956048, + "learning_rate": 0.00016, + "loss": 1.2414, + "step": 33 + }, + { + "epoch": 0.08262454434993925, + "grad_norm": 113.72852454032189, + "learning_rate": 0.000165, + "loss": 1.2508, + "step": 34 + }, + { + "epoch": 0.0850546780072904, + "grad_norm": 150.53415363213057, + "learning_rate": 0.00017, + "loss": 1.2528, + "step": 35 + }, + { + "epoch": 0.08748481166464156, + "grad_norm": 156.19567878683574, + "learning_rate": 0.000175, + "loss": 1.2016, + "step": 36 + }, + { + "epoch": 0.0899149453219927, + "grad_norm": 416.34884765145057, + "learning_rate": 0.00018, + "loss": 1.254, + "step": 37 + }, + { + "epoch": 0.09234507897934387, + "grad_norm": 269.7105025581372, + "learning_rate": 0.00018500000000000002, + "loss": 1.2215, + "step": 38 + }, + { + "epoch": 0.09477521263669501, + "grad_norm": 249.35069047655023, + "learning_rate": 0.00019, + "loss": 1.2078, + "step": 39 + }, + { + "epoch": 0.09720534629404617, + "grad_norm": 167.16896045613478, + "learning_rate": 0.000195, + "loss": 1.1866, + "step": 40 + }, + { + "epoch": 0.09963547995139732, + "grad_norm": 248.22240554128427, + "learning_rate": 0.0002, + "loss": 1.252, + "step": 41 + }, + { + "epoch": 0.10206561360874848, + "grad_norm": 180.89520841022969, + "learning_rate": 0.0001999991930332148, + "loss": 1.2251, + "step": 42 + }, + { + "epoch": 0.10449574726609964, + "grad_norm": 614.4291375430485, + "learning_rate": 0.00019999677214588312, + "loss": 1.2563, + "step": 43 + }, + { + "epoch": 0.10692588092345079, + "grad_norm": 211.7523427355369, + "learning_rate": 0.00019999273737707646, + "loss": 1.193, + "step": 44 + }, + { + "epoch": 0.10935601458080195, + "grad_norm": 181.56788458769344, + "learning_rate": 0.00019998708879191335, + "loss": 1.2598, + "step": 45 + }, + { + "epoch": 0.1117861482381531, + "grad_norm": 157.5783414916277, + "learning_rate": 0.00019997982648155814, + "loss": 1.2663, + "step": 46 + }, + 
{ + "epoch": 0.11421628189550426, + "grad_norm": 155.78006251192625, + "learning_rate": 0.00019997095056321971, + "loss": 1.1637, + "step": 47 + }, + { + "epoch": 0.1166464155528554, + "grad_norm": 202.0253360488958, + "learning_rate": 0.00019996046118014955, + "loss": 1.2508, + "step": 48 + }, + { + "epoch": 0.11907654921020656, + "grad_norm": 192.7576297264874, + "learning_rate": 0.00019994835850163924, + "loss": 1.2014, + "step": 49 + }, + { + "epoch": 0.12150668286755771, + "grad_norm": 132.5484871621418, + "learning_rate": 0.00019993464272301804, + "loss": 1.2279, + "step": 50 + }, + { + "epoch": 0.12393681652490887, + "grad_norm": 128.32285438248965, + "learning_rate": 0.00019991931406564944, + "loss": 1.2179, + "step": 51 + }, + { + "epoch": 0.12636695018226002, + "grad_norm": 552.3669463716512, + "learning_rate": 0.00019990237277692788, + "loss": 1.1498, + "step": 52 + }, + { + "epoch": 0.12879708383961117, + "grad_norm": 86.17911790260192, + "learning_rate": 0.00019988381913027442, + "loss": 1.2784, + "step": 53 + }, + { + "epoch": 0.13122721749696234, + "grad_norm": 70.83294605515782, + "learning_rate": 0.00019986365342513265, + "loss": 1.2224, + "step": 54 + }, + { + "epoch": 0.1336573511543135, + "grad_norm": 45.23624563299466, + "learning_rate": 0.00019984187598696363, + "loss": 1.1746, + "step": 55 + }, + { + "epoch": 0.13608748481166463, + "grad_norm": 57.67645735585192, + "learning_rate": 0.00019981848716724073, + "loss": 1.2154, + "step": 56 + }, + { + "epoch": 0.1385176184690158, + "grad_norm": 45.661268047129674, + "learning_rate": 0.00019979348734344398, + "loss": 1.1411, + "step": 57 + }, + { + "epoch": 0.14094775212636695, + "grad_norm": 53.10628399970359, + "learning_rate": 0.00019976687691905393, + "loss": 1.2029, + "step": 58 + }, + { + "epoch": 0.1433778857837181, + "grad_norm": 38.71353325803162, + "learning_rate": 0.00019973865632354516, + "loss": 1.1976, + "step": 59 + }, + { + "epoch": 0.14580801944106925, + "grad_norm": 
42.789208063581114, + "learning_rate": 0.0001997088260123793, + "loss": 1.1477, + "step": 60 + }, + { + "epoch": 0.14823815309842042, + "grad_norm": 37.613194740192164, + "learning_rate": 0.0001996773864669978, + "loss": 1.2529, + "step": 61 + }, + { + "epoch": 0.15066828675577157, + "grad_norm": 47.96813084127655, + "learning_rate": 0.00019964433819481405, + "loss": 1.2328, + "step": 62 + }, + { + "epoch": 0.15309842041312272, + "grad_norm": 55.30483872428545, + "learning_rate": 0.00019960968172920516, + "loss": 1.1996, + "step": 63 + }, + { + "epoch": 0.15552855407047386, + "grad_norm": 35.58995799070749, + "learning_rate": 0.00019957341762950344, + "loss": 1.1248, + "step": 64 + }, + { + "epoch": 0.15795868772782504, + "grad_norm": 58.86131222300149, + "learning_rate": 0.00019953554648098748, + "loss": 1.3017, + "step": 65 + }, + { + "epoch": 0.16038882138517618, + "grad_norm": 32.12091331878439, + "learning_rate": 0.00019949606889487233, + "loss": 1.1961, + "step": 66 + }, + { + "epoch": 0.16281895504252733, + "grad_norm": 167.27433996357928, + "learning_rate": 0.0001994549855083001, + "loss": 1.1768, + "step": 67 + }, + { + "epoch": 0.1652490886998785, + "grad_norm": 32.3328494297432, + "learning_rate": 0.0001994122969843293, + "loss": 1.1802, + "step": 68 + }, + { + "epoch": 0.16767922235722965, + "grad_norm": 39.92530074438497, + "learning_rate": 0.0001993680040119244, + "loss": 1.2098, + "step": 69 + }, + { + "epoch": 0.1701093560145808, + "grad_norm": 45.60830517129956, + "learning_rate": 0.0001993221073059445, + "loss": 1.2159, + "step": 70 + }, + { + "epoch": 0.17253948967193194, + "grad_norm": 35.462695032736335, + "learning_rate": 0.00019927460760713197, + "loss": 1.1818, + "step": 71 + }, + { + "epoch": 0.17496962332928312, + "grad_norm": 43.05751624597826, + "learning_rate": 0.0001992255056821004, + "loss": 1.2011, + "step": 72 + }, + { + "epoch": 0.17739975698663427, + "grad_norm": 47.13143404969894, + "learning_rate": 0.00019917480232332224, + 
"loss": 1.1669, + "step": 73 + }, + { + "epoch": 0.1798298906439854, + "grad_norm": 72.07146401418987, + "learning_rate": 0.000199122498349116, + "loss": 1.181, + "step": 74 + }, + { + "epoch": 0.1822600243013366, + "grad_norm": 36.289202348834955, + "learning_rate": 0.00019906859460363307, + "loss": 1.1787, + "step": 75 + }, + { + "epoch": 0.18469015795868773, + "grad_norm": 46.92636167228936, + "learning_rate": 0.00019901309195684416, + "loss": 1.2316, + "step": 76 + }, + { + "epoch": 0.18712029161603888, + "grad_norm": 31.71425340357504, + "learning_rate": 0.00019895599130452505, + "loss": 1.1607, + "step": 77 + }, + { + "epoch": 0.18955042527339003, + "grad_norm": 43.94199928621344, + "learning_rate": 0.00019889729356824235, + "loss": 1.1919, + "step": 78 + }, + { + "epoch": 0.1919805589307412, + "grad_norm": 45.33073791860179, + "learning_rate": 0.0001988369996953386, + "loss": 1.2237, + "step": 79 + }, + { + "epoch": 0.19441069258809235, + "grad_norm": 135.89980489661897, + "learning_rate": 0.00019877511065891673, + "loss": 1.1822, + "step": 80 + }, + { + "epoch": 0.1968408262454435, + "grad_norm": 439.6770852212966, + "learning_rate": 0.00019871162745782478, + "loss": 1.1441, + "step": 81 + }, + { + "epoch": 0.19927095990279464, + "grad_norm": 80.73319798776026, + "learning_rate": 0.0001986465511166394, + "loss": 1.1709, + "step": 82 + }, + { + "epoch": 0.20170109356014582, + "grad_norm": 87.76515297497458, + "learning_rate": 0.00019857988268564953, + "loss": 1.1549, + "step": 83 + }, + { + "epoch": 0.20413122721749696, + "grad_norm": 70.08754986406095, + "learning_rate": 0.00019851162324083932, + "loss": 1.1771, + "step": 84 + }, + { + "epoch": 0.2065613608748481, + "grad_norm": 187.8198997057664, + "learning_rate": 0.0001984417738838709, + "loss": 1.2068, + "step": 85 + }, + { + "epoch": 0.20899149453219928, + "grad_norm": 127.78818684755072, + "learning_rate": 0.00019837033574206646, + "loss": 1.1974, + "step": 86 + }, + { + "epoch": 0.21142162818955043, 
+ "grad_norm": 127.82979216871074, + "learning_rate": 0.0001982973099683902, + "loss": 1.185, + "step": 87 + }, + { + "epoch": 0.21385176184690158, + "grad_norm": 142.35425084857746, + "learning_rate": 0.00019822269774142954, + "loss": 1.2225, + "step": 88 + }, + { + "epoch": 0.21628189550425272, + "grad_norm": 246.64019353564817, + "learning_rate": 0.0001981465002653763, + "loss": 1.2574, + "step": 89 + }, + { + "epoch": 0.2187120291616039, + "grad_norm": 189.88471076285524, + "learning_rate": 0.0001980687187700071, + "loss": 1.1635, + "step": 90 + }, + { + "epoch": 0.22114216281895505, + "grad_norm": 116.65693373141701, + "learning_rate": 0.00019798935451066361, + "loss": 1.1457, + "step": 91 + }, + { + "epoch": 0.2235722964763062, + "grad_norm": 71.76422539970217, + "learning_rate": 0.00019790840876823232, + "loss": 1.2354, + "step": 92 + }, + { + "epoch": 0.22600243013365734, + "grad_norm": 139.42330509386431, + "learning_rate": 0.0001978258828491236, + "loss": 1.18, + "step": 93 + }, + { + "epoch": 0.2284325637910085, + "grad_norm": 131.88308820601443, + "learning_rate": 0.00019774177808525113, + "loss": 1.1868, + "step": 94 + }, + { + "epoch": 0.23086269744835966, + "grad_norm": 85.81071125615291, + "learning_rate": 0.00019765609583400977, + "loss": 1.1814, + "step": 95 + }, + { + "epoch": 0.2332928311057108, + "grad_norm": 84.43756298541064, + "learning_rate": 0.00019756883747825424, + "loss": 1.1658, + "step": 96 + }, + { + "epoch": 0.23572296476306198, + "grad_norm": 114.24245545143974, + "learning_rate": 0.0001974800044262764, + "loss": 1.2497, + "step": 97 + }, + { + "epoch": 0.23815309842041313, + "grad_norm": 76.577511222722, + "learning_rate": 0.00019738959811178272, + "loss": 1.1414, + "step": 98 + }, + { + "epoch": 0.24058323207776428, + "grad_norm": 171.8084830895381, + "learning_rate": 0.00019729761999387103, + "loss": 1.1619, + "step": 99 + }, + { + "epoch": 0.24301336573511542, + "grad_norm": 221.87752250936416, + "learning_rate": 
0.00019720407155700707, + "loss": 1.2718, + "step": 100 + }, + { + "epoch": 0.2454434993924666, + "grad_norm": 205.64943975370608, + "learning_rate": 0.00019710895431100046, + "loss": 1.1786, + "step": 101 + }, + { + "epoch": 0.24787363304981774, + "grad_norm": 160.16582903260615, + "learning_rate": 0.00019701226979098037, + "loss": 1.1426, + "step": 102 + }, + { + "epoch": 0.2503037667071689, + "grad_norm": 82.85031394537334, + "learning_rate": 0.00019691401955737072, + "loss": 1.1718, + "step": 103 + }, + { + "epoch": 0.2503037667071689, + "eval_loss": 1.1633374691009521, + "eval_runtime": 52.6182, + "eval_samples_per_second": 14.14, + "eval_steps_per_second": 1.767, + "step": 103 + }, + { + "epoch": 0.25273390036452004, + "grad_norm": 94.74469296109082, + "learning_rate": 0.000196814205195865, + "loss": 1.2255, + "step": 104 + }, + { + "epoch": 0.2551640340218712, + "grad_norm": 126.15797466756656, + "learning_rate": 0.00019671282831740076, + "loss": 1.1623, + "step": 105 + }, + { + "epoch": 0.25759416767922233, + "grad_norm": 79.41156434272008, + "learning_rate": 0.0001966098905581334, + "loss": 1.1606, + "step": 106 + }, + { + "epoch": 0.2600243013365735, + "grad_norm": 70.33104031058372, + "learning_rate": 0.00019650539357941003, + "loss": 1.196, + "step": 107 + }, + { + "epoch": 0.2624544349939247, + "grad_norm": 69.57260733822498, + "learning_rate": 0.0001963993390677424, + "loss": 1.1939, + "step": 108 + }, + { + "epoch": 0.2648845686512758, + "grad_norm": 81.78820691772725, + "learning_rate": 0.00019629172873477995, + "loss": 1.2553, + "step": 109 + }, + { + "epoch": 0.267314702308627, + "grad_norm": 117.06324110268656, + "learning_rate": 0.00019618256431728194, + "loss": 1.2535, + "step": 110 + }, + { + "epoch": 0.26974483596597815, + "grad_norm": 83.26993317104247, + "learning_rate": 0.00019607184757708951, + "loss": 1.157, + "step": 111 + }, + { + "epoch": 0.27217496962332927, + "grad_norm": 51.990829456422375, + "learning_rate": 
0.00019595958030109735, + "loss": 1.1274, + "step": 112 + }, + { + "epoch": 0.27460510328068044, + "grad_norm": 119.7487160875729, + "learning_rate": 0.00019584576430122473, + "loss": 1.1422, + "step": 113 + }, + { + "epoch": 0.2770352369380316, + "grad_norm": 88.15636932272304, + "learning_rate": 0.00019573040141438624, + "loss": 1.1599, + "step": 114 + }, + { + "epoch": 0.27946537059538273, + "grad_norm": 62.346402225534774, + "learning_rate": 0.00019561349350246226, + "loss": 1.1909, + "step": 115 + }, + { + "epoch": 0.2818955042527339, + "grad_norm": 76.40612150653034, + "learning_rate": 0.0001954950424522688, + "loss": 1.1646, + "step": 116 + }, + { + "epoch": 0.284325637910085, + "grad_norm": 94.8711613055073, + "learning_rate": 0.00019537505017552716, + "loss": 1.1547, + "step": 117 + }, + { + "epoch": 0.2867557715674362, + "grad_norm": 63.86961661796314, + "learning_rate": 0.00019525351860883293, + "loss": 1.1841, + "step": 118 + }, + { + "epoch": 0.2891859052247874, + "grad_norm": 133.2417924150684, + "learning_rate": 0.00019513044971362494, + "loss": 1.1365, + "step": 119 + }, + { + "epoch": 0.2916160388821385, + "grad_norm": 133.44891510996445, + "learning_rate": 0.00019500584547615333, + "loss": 1.1696, + "step": 120 + }, + { + "epoch": 0.29404617253948967, + "grad_norm": 58.51701768739601, + "learning_rate": 0.00019487970790744774, + "loss": 1.1874, + "step": 121 + }, + { + "epoch": 0.29647630619684084, + "grad_norm": 49.536158238056196, + "learning_rate": 0.00019475203904328474, + "loss": 1.1798, + "step": 122 + }, + { + "epoch": 0.29890643985419196, + "grad_norm": 94.27608706983857, + "learning_rate": 0.000194622840944155, + "loss": 1.2443, + "step": 123 + }, + { + "epoch": 0.30133657351154314, + "grad_norm": 103.868243202843, + "learning_rate": 0.00019449211569523, + "loss": 1.1759, + "step": 124 + }, + { + "epoch": 0.3037667071688943, + "grad_norm": 73.31536435980003, + "learning_rate": 0.00019435986540632843, + "loss": 1.1885, + "step": 125 + }, + 
{ + "epoch": 0.30619684082624543, + "grad_norm": 64.91149114745738, + "learning_rate": 0.00019422609221188207, + "loss": 1.1864, + "step": 126 + }, + { + "epoch": 0.3086269744835966, + "grad_norm": 95.34449184763653, + "learning_rate": 0.00019409079827090145, + "loss": 1.1339, + "step": 127 + }, + { + "epoch": 0.3110571081409477, + "grad_norm": 67.36156159754226, + "learning_rate": 0.00019395398576694086, + "loss": 1.1845, + "step": 128 + }, + { + "epoch": 0.3134872417982989, + "grad_norm": 36.94913176821407, + "learning_rate": 0.00019381565690806328, + "loss": 1.2154, + "step": 129 + }, + { + "epoch": 0.3159173754556501, + "grad_norm": 69.05265214547647, + "learning_rate": 0.00019367581392680457, + "loss": 1.1642, + "step": 130 + }, + { + "epoch": 0.3183475091130012, + "grad_norm": 38.974761165559855, + "learning_rate": 0.00019353445908013755, + "loss": 1.1508, + "step": 131 + }, + { + "epoch": 0.32077764277035237, + "grad_norm": 48.47215142199794, + "learning_rate": 0.00019339159464943557, + "loss": 1.2011, + "step": 132 + }, + { + "epoch": 0.32320777642770354, + "grad_norm": 41.88512063342574, + "learning_rate": 0.00019324722294043558, + "loss": 1.1643, + "step": 133 + }, + { + "epoch": 0.32563791008505466, + "grad_norm": 25.59403215229145, + "learning_rate": 0.00019310134628320114, + "loss": 1.1954, + "step": 134 + }, + { + "epoch": 0.32806804374240583, + "grad_norm": 58.02634988046396, + "learning_rate": 0.00019295396703208453, + "loss": 1.1544, + "step": 135 + }, + { + "epoch": 0.330498177399757, + "grad_norm": 31.26218977398251, + "learning_rate": 0.00019280508756568896, + "loss": 1.1613, + "step": 136 + }, + { + "epoch": 0.33292831105710813, + "grad_norm": 31.81234539284103, + "learning_rate": 0.00019265471028683014, + "loss": 1.1892, + "step": 137 + }, + { + "epoch": 0.3353584447144593, + "grad_norm": 54.44930114675527, + "learning_rate": 0.00019250283762249748, + "loss": 1.2801, + "step": 138 + }, + { + "epoch": 0.3377885783718105, + "grad_norm": 
30.320486287732734, + "learning_rate": 0.00019234947202381486, + "loss": 1.1934, + "step": 139 + }, + { + "epoch": 0.3402187120291616, + "grad_norm": 32.76175001943503, + "learning_rate": 0.00019219461596600113, + "loss": 1.1436, + "step": 140 + }, + { + "epoch": 0.34264884568651277, + "grad_norm": 36.802264122697316, + "learning_rate": 0.00019203827194833026, + "loss": 1.1418, + "step": 141 + }, + { + "epoch": 0.3450789793438639, + "grad_norm": 35.03898729580271, + "learning_rate": 0.0001918804424940908, + "loss": 1.2479, + "step": 142 + }, + { + "epoch": 0.34750911300121506, + "grad_norm": 89.58068030461165, + "learning_rate": 0.00019172113015054532, + "loss": 1.2504, + "step": 143 + }, + { + "epoch": 0.34993924665856624, + "grad_norm": 30.05799668441019, + "learning_rate": 0.00019156033748888917, + "loss": 1.1662, + "step": 144 + }, + { + "epoch": 0.35236938031591736, + "grad_norm": 33.80121199203598, + "learning_rate": 0.00019139806710420914, + "loss": 1.1862, + "step": 145 + }, + { + "epoch": 0.35479951397326853, + "grad_norm": 31.510896023067872, + "learning_rate": 0.00019123432161544142, + "loss": 1.147, + "step": 146 + }, + { + "epoch": 0.3572296476306197, + "grad_norm": 32.92613286618093, + "learning_rate": 0.00019106910366532942, + "loss": 1.1421, + "step": 147 + }, + { + "epoch": 0.3596597812879708, + "grad_norm": 245.36013493823395, + "learning_rate": 0.00019090241592038113, + "loss": 1.1306, + "step": 148 + }, + { + "epoch": 0.362089914945322, + "grad_norm": 72.3061625644275, + "learning_rate": 0.000190734261070826, + "loss": 1.1144, + "step": 149 + }, + { + "epoch": 0.3645200486026732, + "grad_norm": 63.77748866336388, + "learning_rate": 0.00019056464183057157, + "loss": 1.1249, + "step": 150 + }, + { + "epoch": 0.3669501822600243, + "grad_norm": 633.2421324308109, + "learning_rate": 0.00019039356093715975, + "loss": 1.1359, + "step": 151 + }, + { + "epoch": 0.36938031591737547, + "grad_norm": 34.456657555313704, + "learning_rate": 
0.00019022102115172248, + "loss": 1.1397, + "step": 152 + }, + { + "epoch": 0.3718104495747266, + "grad_norm": 35.21328820959324, + "learning_rate": 0.00019004702525893732, + "loss": 1.1741, + "step": 153 + }, + { + "epoch": 0.37424058323207776, + "grad_norm": 90.32405227187036, + "learning_rate": 0.00018987157606698235, + "loss": 1.1844, + "step": 154 + }, + { + "epoch": 0.37667071688942894, + "grad_norm": 39.348755664527914, + "learning_rate": 0.000189694676407491, + "loss": 1.1216, + "step": 155 + }, + { + "epoch": 0.37910085054678005, + "grad_norm": 58.85540744859834, + "learning_rate": 0.00018951632913550626, + "loss": 1.115, + "step": 156 + }, + { + "epoch": 0.38153098420413123, + "grad_norm": 39.849945227365325, + "learning_rate": 0.0001893365371294346, + "loss": 1.1705, + "step": 157 + }, + { + "epoch": 0.3839611178614824, + "grad_norm": 40.300954908722304, + "learning_rate": 0.0001891553032909996, + "loss": 1.1831, + "step": 158 + }, + { + "epoch": 0.3863912515188335, + "grad_norm": 53.72009888405355, + "learning_rate": 0.00018897263054519498, + "loss": 1.1613, + "step": 159 + }, + { + "epoch": 0.3888213851761847, + "grad_norm": 142.22686975859034, + "learning_rate": 0.0001887885218402375, + "loss": 1.1639, + "step": 160 + }, + { + "epoch": 0.39125151883353587, + "grad_norm": 50.141889086717356, + "learning_rate": 0.00018860298014751944, + "loss": 1.1659, + "step": 161 + }, + { + "epoch": 0.393681652490887, + "grad_norm": 63.25519968311113, + "learning_rate": 0.0001884160084615604, + "loss": 1.168, + "step": 162 + }, + { + "epoch": 0.39611178614823817, + "grad_norm": 50.59325246324073, + "learning_rate": 0.0001882276097999592, + "loss": 1.1202, + "step": 163 + }, + { + "epoch": 0.3985419198055893, + "grad_norm": 58.32587879810431, + "learning_rate": 0.0001880377872033451, + "loss": 1.1587, + "step": 164 + }, + { + "epoch": 0.40097205346294046, + "grad_norm": 211.50882688314653, + "learning_rate": 0.00018784654373532866, + "loss": 1.1551, + "step": 165 + }, 
+ { + "epoch": 0.40340218712029163, + "grad_norm": 47.82888424614203, + "learning_rate": 0.00018765388248245246, + "loss": 1.2274, + "step": 166 + }, + { + "epoch": 0.40583232077764275, + "grad_norm": 97.94922685274778, + "learning_rate": 0.00018745980655414114, + "loss": 1.0872, + "step": 167 + }, + { + "epoch": 0.4082624544349939, + "grad_norm": 44.74994721544976, + "learning_rate": 0.0001872643190826512, + "loss": 1.1244, + "step": 168 + }, + { + "epoch": 0.4106925880923451, + "grad_norm": 53.84692426866845, + "learning_rate": 0.00018706742322302064, + "loss": 1.1576, + "step": 169 + }, + { + "epoch": 0.4131227217496962, + "grad_norm": 54.43599132185614, + "learning_rate": 0.0001868691221530178, + "loss": 1.0957, + "step": 170 + }, + { + "epoch": 0.4155528554070474, + "grad_norm": 39.21766518089018, + "learning_rate": 0.00018666941907309026, + "loss": 1.1625, + "step": 171 + }, + { + "epoch": 0.41798298906439857, + "grad_norm": 49.40030697752548, + "learning_rate": 0.000186468317206313, + "loss": 1.1556, + "step": 172 + }, + { + "epoch": 0.4204131227217497, + "grad_norm": 101.50309647820374, + "learning_rate": 0.0001862658197983366, + "loss": 1.1687, + "step": 173 + }, + { + "epoch": 0.42284325637910086, + "grad_norm": 105.41233861946563, + "learning_rate": 0.0001860619301173347, + "loss": 1.1687, + "step": 174 + }, + { + "epoch": 0.425273390036452, + "grad_norm": 103.99749987770305, + "learning_rate": 0.0001858566514539513, + "loss": 1.144, + "step": 175 + }, + { + "epoch": 0.42770352369380316, + "grad_norm": 78.83490301242213, + "learning_rate": 0.0001856499871212477, + "loss": 1.2318, + "step": 176 + }, + { + "epoch": 0.43013365735115433, + "grad_norm": 62.325757489859335, + "learning_rate": 0.00018544194045464886, + "loss": 1.1092, + "step": 177 + }, + { + "epoch": 0.43256379100850545, + "grad_norm": 81.32804926878099, + "learning_rate": 0.00018523251481188986, + "loss": 1.2233, + "step": 178 + }, + { + "epoch": 0.4349939246658566, + "grad_norm": 
38.97928032166606, + "learning_rate": 0.00018502171357296144, + "loss": 1.2371, + "step": 179 + }, + { + "epoch": 0.4374240583232078, + "grad_norm": 82.62345361244209, + "learning_rate": 0.0001848095401400555, + "loss": 1.1562, + "step": 180 + }, + { + "epoch": 0.4398541919805589, + "grad_norm": 47.793381366401626, + "learning_rate": 0.0001845959979375104, + "loss": 1.1249, + "step": 181 + }, + { + "epoch": 0.4422843256379101, + "grad_norm": 53.6022948471739, + "learning_rate": 0.00018438109041175532, + "loss": 1.1415, + "step": 182 + }, + { + "epoch": 0.44471445929526127, + "grad_norm": 65.92717051568573, + "learning_rate": 0.00018416482103125506, + "loss": 1.1748, + "step": 183 + }, + { + "epoch": 0.4471445929526124, + "grad_norm": 59.410481167619494, + "learning_rate": 0.0001839471932864537, + "loss": 1.1399, + "step": 184 + }, + { + "epoch": 0.44957472660996356, + "grad_norm": 64.22740395872977, + "learning_rate": 0.0001837282106897185, + "loss": 1.2193, + "step": 185 + }, + { + "epoch": 0.4520048602673147, + "grad_norm": 54.63497168787729, + "learning_rate": 0.00018350787677528306, + "loss": 1.153, + "step": 186 + }, + { + "epoch": 0.45443499392466585, + "grad_norm": 49.60676029637355, + "learning_rate": 0.00018328619509919044, + "loss": 1.1509, + "step": 187 + }, + { + "epoch": 0.456865127582017, + "grad_norm": 32.29074835877607, + "learning_rate": 0.00018306316923923563, + "loss": 1.1851, + "step": 188 + }, + { + "epoch": 0.45929526123936815, + "grad_norm": 61.13632454163589, + "learning_rate": 0.0001828388027949078, + "loss": 1.1323, + "step": 189 + }, + { + "epoch": 0.4617253948967193, + "grad_norm": 67.48617660835801, + "learning_rate": 0.00018261309938733238, + "loss": 1.1956, + "step": 190 + }, + { + "epoch": 0.4641555285540705, + "grad_norm": 38.31182257784929, + "learning_rate": 0.00018238606265921238, + "loss": 1.1379, + "step": 191 + }, + { + "epoch": 0.4665856622114216, + "grad_norm": 47.30995766708629, + "learning_rate": 0.00018215769627476984, + 
"loss": 1.1462, + "step": 192 + }, + { + "epoch": 0.4690157958687728, + "grad_norm": 34.57093925891121, + "learning_rate": 0.00018192800391968642, + "loss": 1.1979, + "step": 193 + }, + { + "epoch": 0.47144592952612396, + "grad_norm": 34.45645740457662, + "learning_rate": 0.0001816969893010442, + "loss": 1.1763, + "step": 194 + }, + { + "epoch": 0.4738760631834751, + "grad_norm": 39.21862152859671, + "learning_rate": 0.00018146465614726567, + "loss": 1.1514, + "step": 195 + }, + { + "epoch": 0.47630619684082626, + "grad_norm": 34.765347344568106, + "learning_rate": 0.00018123100820805355, + "loss": 1.1426, + "step": 196 + }, + { + "epoch": 0.4787363304981774, + "grad_norm": 35.04245362239315, + "learning_rate": 0.00018099604925433043, + "loss": 1.143, + "step": 197 + }, + { + "epoch": 0.48116646415552855, + "grad_norm": 103.45636476066032, + "learning_rate": 0.00018075978307817764, + "loss": 1.1713, + "step": 198 + }, + { + "epoch": 0.4835965978128797, + "grad_norm": 43.0297373660821, + "learning_rate": 0.00018052221349277442, + "loss": 1.2226, + "step": 199 + }, + { + "epoch": 0.48602673147023084, + "grad_norm": 32.80474372048966, + "learning_rate": 0.000180283344332336, + "loss": 1.1556, + "step": 200 + }, + { + "epoch": 0.488456865127582, + "grad_norm": 59.42688731224296, + "learning_rate": 0.00018004317945205197, + "loss": 1.1411, + "step": 201 + }, + { + "epoch": 0.4908869987849332, + "grad_norm": 102.0917822407188, + "learning_rate": 0.000179801722728024, + "loss": 1.1309, + "step": 202 + }, + { + "epoch": 0.4933171324422843, + "grad_norm": 309.9346821950787, + "learning_rate": 0.0001795589780572031, + "loss": 1.1953, + "step": 203 + }, + { + "epoch": 0.4957472660996355, + "grad_norm": 344.5019267346993, + "learning_rate": 0.0001793149493573271, + "loss": 1.1524, + "step": 204 + }, + { + "epoch": 0.49817739975698666, + "grad_norm": 50.075205946207085, + "learning_rate": 0.00017906964056685706, + "loss": 1.1495, + "step": 205 + }, + { + "epoch": 
0.5006075334143378, + "grad_norm": 132.32227258331488, + "learning_rate": 0.00017882305564491396, + "loss": 1.1976, + "step": 206 + }, + { + "epoch": 0.5006075334143378, + "eval_loss": 1.146019458770752, + "eval_runtime": 52.7816, + "eval_samples_per_second": 14.096, + "eval_steps_per_second": 1.762, + "step": 206 + }, + { + "epoch": 0.503037667071689, + "grad_norm": 138.57200377669218, + "learning_rate": 0.00017857519857121458, + "loss": 1.2159, + "step": 207 + }, + { + "epoch": 0.5054678007290401, + "grad_norm": 268.41109734161546, + "learning_rate": 0.00017832607334600746, + "loss": 1.1748, + "step": 208 + }, + { + "epoch": 0.5078979343863913, + "grad_norm": 72.44153953442401, + "learning_rate": 0.00017807568399000822, + "loss": 1.1758, + "step": 209 + }, + { + "epoch": 0.5103280680437424, + "grad_norm": 97.75400124096738, + "learning_rate": 0.00017782403454433477, + "loss": 1.1004, + "step": 210 + }, + { + "epoch": 0.5127582017010935, + "grad_norm": 84.19522802756285, + "learning_rate": 0.000177571129070442, + "loss": 1.1397, + "step": 211 + }, + { + "epoch": 0.5151883353584447, + "grad_norm": 132.95081835535706, + "learning_rate": 0.00017731697165005618, + "loss": 1.146, + "step": 212 + }, + { + "epoch": 0.5176184690157959, + "grad_norm": 560.3351292126325, + "learning_rate": 0.0001770615663851093, + "loss": 1.1937, + "step": 213 + }, + { + "epoch": 0.520048602673147, + "grad_norm": 252.72862614645885, + "learning_rate": 0.0001768049173976727, + "loss": 1.1213, + "step": 214 + }, + { + "epoch": 0.5224787363304981, + "grad_norm": 356.2985211032981, + "learning_rate": 0.0001765470288298905, + "loss": 1.22, + "step": 215 + }, + { + "epoch": 0.5249088699878494, + "grad_norm": 952.600672502031, + "learning_rate": 0.00017628790484391284, + "loss": 1.1321, + "step": 216 + }, + { + "epoch": 0.5273390036452005, + "grad_norm": 289.9357041930161, + "learning_rate": 0.0001760275496218288, + "loss": 1.1688, + "step": 217 + }, + { + "epoch": 0.5297691373025516, + 
"grad_norm": 48.69445264741508, + "learning_rate": 0.0001757659673655986, + "loss": 1.1551, + "step": 218 + }, + { + "epoch": 0.5321992709599028, + "grad_norm": 40.15160247154335, + "learning_rate": 0.0001755031622969862, + "loss": 1.1459, + "step": 219 + }, + { + "epoch": 0.534629404617254, + "grad_norm": 44.59390817019205, + "learning_rate": 0.00017523913865749078, + "loss": 1.2012, + "step": 220 + }, + { + "epoch": 0.5370595382746051, + "grad_norm": 30.189717624412484, + "learning_rate": 0.00017497390070827848, + "loss": 1.15, + "step": 221 + }, + { + "epoch": 0.5394896719319563, + "grad_norm": 27.185608574176108, + "learning_rate": 0.00017470745273011362, + "loss": 1.0763, + "step": 222 + }, + { + "epoch": 0.5419198055893074, + "grad_norm": 99.44121390806423, + "learning_rate": 0.00017443979902328956, + "loss": 1.1478, + "step": 223 + }, + { + "epoch": 0.5443499392466585, + "grad_norm": 29.684499344634585, + "learning_rate": 0.00017417094390755934, + "loss": 1.1123, + "step": 224 + }, + { + "epoch": 0.5467800729040098, + "grad_norm": 26.788847114635054, + "learning_rate": 0.00017390089172206592, + "loss": 1.1169, + "step": 225 + }, + { + "epoch": 0.5492102065613609, + "grad_norm": 31.84817878214798, + "learning_rate": 0.00017362964682527218, + "loss": 1.1524, + "step": 226 + }, + { + "epoch": 0.551640340218712, + "grad_norm": 34.834632993822424, + "learning_rate": 0.00017335721359489057, + "loss": 1.1761, + "step": 227 + }, + { + "epoch": 0.5540704738760632, + "grad_norm": 66.6084234453716, + "learning_rate": 0.00017308359642781242, + "loss": 1.1175, + "step": 228 + }, + { + "epoch": 0.5565006075334143, + "grad_norm": 35.15720180142773, + "learning_rate": 0.00017280879974003707, + "loss": 1.2012, + "step": 229 + }, + { + "epoch": 0.5589307411907655, + "grad_norm": 35.975450782756226, + "learning_rate": 0.00017253282796660056, + "loss": 1.1801, + "step": 230 + }, + { + "epoch": 0.5613608748481167, + "grad_norm": 83.49050230764925, + "learning_rate": 
0.0001722556855615039, + "loss": 1.1576, + "step": 231 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 150.44630441002784, + "learning_rate": 0.00017197737699764146, + "loss": 1.1826, + "step": 232 + }, + { + "epoch": 0.5662211421628189, + "grad_norm": 31.322382197739042, + "learning_rate": 0.00017169790676672858, + "loss": 1.1784, + "step": 233 + }, + { + "epoch": 0.56865127582017, + "grad_norm": 33.15983653687515, + "learning_rate": 0.0001714172793792291, + "loss": 1.1411, + "step": 234 + }, + { + "epoch": 0.5710814094775213, + "grad_norm": 22.206850165103052, + "learning_rate": 0.0001711354993642827, + "loss": 1.1772, + "step": 235 + }, + { + "epoch": 0.5735115431348724, + "grad_norm": 43.35721272668955, + "learning_rate": 0.00017085257126963152, + "loss": 1.0915, + "step": 236 + }, + { + "epoch": 0.5759416767922235, + "grad_norm": 29.57234737116712, + "learning_rate": 0.0001705684996615472, + "loss": 1.0977, + "step": 237 + }, + { + "epoch": 0.5783718104495748, + "grad_norm": 42.929644875053214, + "learning_rate": 0.00017028328912475668, + "loss": 1.1782, + "step": 238 + }, + { + "epoch": 0.5808019441069259, + "grad_norm": 32.15711272871687, + "learning_rate": 0.0001699969442623686, + "loss": 1.1855, + "step": 239 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 43.64453730184205, + "learning_rate": 0.00016970946969579887, + "loss": 1.1171, + "step": 240 + }, + { + "epoch": 0.5856622114216282, + "grad_norm": 26.145541544112593, + "learning_rate": 0.00016942087006469592, + "loss": 1.1656, + "step": 241 + }, + { + "epoch": 0.5880923450789793, + "grad_norm": 53.98173886095731, + "learning_rate": 0.00016913115002686616, + "loss": 1.1378, + "step": 242 + }, + { + "epoch": 0.5905224787363305, + "grad_norm": 50.851193586801195, + "learning_rate": 0.00016884031425819853, + "loss": 1.1338, + "step": 243 + }, + { + "epoch": 0.5929526123936817, + "grad_norm": 30.166674036386443, + "learning_rate": 0.0001685483674525891, + "loss": 1.1732, + "step": 244 + }, + { 
+ "epoch": 0.5953827460510328, + "grad_norm": 32.580505176392656, + "learning_rate": 0.00016825531432186543, + "loss": 1.143, + "step": 245 + }, + { + "epoch": 0.5978128797083839, + "grad_norm": 35.087231952662634, + "learning_rate": 0.0001679611595957103, + "loss": 1.212, + "step": 246 + }, + { + "epoch": 0.6002430133657352, + "grad_norm": 44.69578306542608, + "learning_rate": 0.00016766590802158566, + "loss": 1.1527, + "step": 247 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 39.8378839133733, + "learning_rate": 0.00016736956436465573, + "loss": 1.2174, + "step": 248 + }, + { + "epoch": 0.6051032806804374, + "grad_norm": 25.571860004032857, + "learning_rate": 0.0001670721334077103, + "loss": 1.1031, + "step": 249 + }, + { + "epoch": 0.6075334143377886, + "grad_norm": 27.626061413643438, + "learning_rate": 0.00016677361995108743, + "loss": 1.107, + "step": 250 + }, + { + "epoch": 0.6099635479951397, + "grad_norm": 47.405627339857176, + "learning_rate": 0.00016647402881259598, + "loss": 1.1521, + "step": 251 + }, + { + "epoch": 0.6123936816524909, + "grad_norm": 31.951762409660272, + "learning_rate": 0.00016617336482743794, + "loss": 1.174, + "step": 252 + }, + { + "epoch": 0.6148238153098421, + "grad_norm": 44.304437144236104, + "learning_rate": 0.00016587163284813032, + "loss": 1.1286, + "step": 253 + }, + { + "epoch": 0.6172539489671932, + "grad_norm": 21.990501251879344, + "learning_rate": 0.00016556883774442675, + "loss": 1.1927, + "step": 254 + }, + { + "epoch": 0.6196840826245443, + "grad_norm": 43.91119350789936, + "learning_rate": 0.00016526498440323914, + "loss": 1.1399, + "step": 255 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 28.064569132249982, + "learning_rate": 0.00016496007772855853, + "loss": 1.1913, + "step": 256 + }, + { + "epoch": 0.6245443499392467, + "grad_norm": 99.97142272243896, + "learning_rate": 0.0001646541226413761, + "loss": 1.1694, + "step": 257 + }, + { + "epoch": 0.6269744835965978, + "grad_norm": 
27.12524206817854, + "learning_rate": 0.00016434712407960373, + "loss": 1.2398, + "step": 258 + }, + { + "epoch": 0.6294046172539489, + "grad_norm": 42.99171796479219, + "learning_rate": 0.00016403908699799425, + "loss": 1.145, + "step": 259 + }, + { + "epoch": 0.6318347509113001, + "grad_norm": 24.064938768293658, + "learning_rate": 0.00016373001636806153, + "loss": 1.098, + "step": 260 + }, + { + "epoch": 0.6342648845686513, + "grad_norm": 31.72232981247621, + "learning_rate": 0.00016341991717800023, + "loss": 1.1779, + "step": 261 + }, + { + "epoch": 0.6366950182260024, + "grad_norm": 39.97326887390835, + "learning_rate": 0.00016310879443260528, + "loss": 1.3142, + "step": 262 + }, + { + "epoch": 0.6391251518833536, + "grad_norm": 27.519208072826963, + "learning_rate": 0.00016279665315319114, + "loss": 1.2039, + "step": 263 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 52.94895557810481, + "learning_rate": 0.00016248349837751062, + "loss": 1.1718, + "step": 264 + }, + { + "epoch": 0.6439854191980559, + "grad_norm": 23.603047222747566, + "learning_rate": 0.0001621693351596739, + "loss": 1.1155, + "step": 265 + }, + { + "epoch": 0.6464155528554071, + "grad_norm": 21.400341520569807, + "learning_rate": 0.00016185416857006647, + "loss": 1.1242, + "step": 266 + }, + { + "epoch": 0.6488456865127582, + "grad_norm": 51.167335508822276, + "learning_rate": 0.00016153800369526788, + "loss": 1.1746, + "step": 267 + }, + { + "epoch": 0.6512758201701093, + "grad_norm": 26.219581065473573, + "learning_rate": 0.00016122084563796905, + "loss": 1.0836, + "step": 268 + }, + { + "epoch": 0.6537059538274606, + "grad_norm": 56.820249886600706, + "learning_rate": 0.0001609026995168904, + "loss": 1.1625, + "step": 269 + }, + { + "epoch": 0.6561360874848117, + "grad_norm": 37.43384869992443, + "learning_rate": 0.00016058357046669898, + "loss": 1.2143, + "step": 270 + }, + { + "epoch": 0.6585662211421628, + "grad_norm": 31.885237168871473, + "learning_rate": 
0.00016026346363792567, + "loss": 1.1536, + "step": 271 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 34.66147983279251, + "learning_rate": 0.00015994238419688199, + "loss": 1.2095, + "step": 272 + }, + { + "epoch": 0.6634264884568651, + "grad_norm": 86.90365354594917, + "learning_rate": 0.00015962033732557686, + "loss": 1.1149, + "step": 273 + }, + { + "epoch": 0.6658566221142163, + "grad_norm": 52.21177462889067, + "learning_rate": 0.00015929732822163287, + "loss": 1.1861, + "step": 274 + }, + { + "epoch": 0.6682867557715675, + "grad_norm": 92.11184701145604, + "learning_rate": 0.00015897336209820239, + "loss": 1.1853, + "step": 275 + }, + { + "epoch": 0.6707168894289186, + "grad_norm": 30.662475573811115, + "learning_rate": 0.00015864844418388342, + "loss": 1.0912, + "step": 276 + }, + { + "epoch": 0.6731470230862697, + "grad_norm": 26.15855468837027, + "learning_rate": 0.00015832257972263523, + "loss": 1.1618, + "step": 277 + }, + { + "epoch": 0.675577156743621, + "grad_norm": 41.14250673970726, + "learning_rate": 0.00015799577397369375, + "loss": 1.1499, + "step": 278 + }, + { + "epoch": 0.6780072904009721, + "grad_norm": 31.93253644773631, + "learning_rate": 0.00015766803221148673, + "loss": 1.1229, + "step": 279 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 39.87120131585165, + "learning_rate": 0.00015733935972554844, + "loss": 1.1647, + "step": 280 + }, + { + "epoch": 0.6828675577156743, + "grad_norm": 52.741654062271124, + "learning_rate": 0.0001570097618204345, + "loss": 1.1362, + "step": 281 + }, + { + "epoch": 0.6852976913730255, + "grad_norm": 33.13137686002526, + "learning_rate": 0.0001566792438156362, + "loss": 1.1825, + "step": 282 + }, + { + "epoch": 0.6877278250303767, + "grad_norm": 20.284041564566042, + "learning_rate": 0.00015634781104549442, + "loss": 1.1439, + "step": 283 + }, + { + "epoch": 0.6901579586877278, + "grad_norm": 164.9222932471453, + "learning_rate": 0.00015601546885911404, + "loss": 1.122, + "step": 284 + }, + { 
+ "epoch": 0.692588092345079, + "grad_norm": 27.092346730158148, + "learning_rate": 0.00015568222262027717, + "loss": 1.157, + "step": 285 + }, + { + "epoch": 0.6950182260024301, + "grad_norm": 39.46898996008012, + "learning_rate": 0.00015534807770735664, + "loss": 1.1092, + "step": 286 + }, + { + "epoch": 0.6974483596597812, + "grad_norm": 30.00942949300714, + "learning_rate": 0.00015501303951322943, + "loss": 1.243, + "step": 287 + }, + { + "epoch": 0.6998784933171325, + "grad_norm": 31.435817418038887, + "learning_rate": 0.00015467711344518942, + "loss": 1.1034, + "step": 288 + }, + { + "epoch": 0.7023086269744836, + "grad_norm": 54.53572773177548, + "learning_rate": 0.00015434030492486023, + "loss": 1.2216, + "step": 289 + }, + { + "epoch": 0.7047387606318347, + "grad_norm": 24.51082708234768, + "learning_rate": 0.00015400261938810757, + "loss": 1.1532, + "step": 290 + }, + { + "epoch": 0.707168894289186, + "grad_norm": 104.85480514443172, + "learning_rate": 0.00015366406228495172, + "loss": 1.1156, + "step": 291 + }, + { + "epoch": 0.7095990279465371, + "grad_norm": 26.398830117870997, + "learning_rate": 0.0001533246390794794, + "loss": 1.0934, + "step": 292 + }, + { + "epoch": 0.7120291616038882, + "grad_norm": 25.062392373037707, + "learning_rate": 0.00015298435524975572, + "loss": 1.1453, + "step": 293 + }, + { + "epoch": 0.7144592952612394, + "grad_norm": 25.385505352027444, + "learning_rate": 0.0001526432162877356, + "loss": 1.1359, + "step": 294 + }, + { + "epoch": 0.7168894289185905, + "grad_norm": 18.00146943000571, + "learning_rate": 0.00015230122769917527, + "loss": 1.1129, + "step": 295 + }, + { + "epoch": 0.7193195625759417, + "grad_norm": 22.55383473288135, + "learning_rate": 0.00015195839500354335, + "loss": 1.142, + "step": 296 + }, + { + "epoch": 0.7217496962332929, + "grad_norm": 30.013723395820165, + "learning_rate": 0.00015161472373393186, + "loss": 1.1379, + "step": 297 + }, + { + "epoch": 0.724179829890644, + "grad_norm": 
40.566201545240425, + "learning_rate": 0.0001512702194369668, + "loss": 1.1326, + "step": 298 + }, + { + "epoch": 0.7266099635479951, + "grad_norm": 27.34716639907029, + "learning_rate": 0.00015092488767271857, + "loss": 1.0782, + "step": 299 + }, + { + "epoch": 0.7290400972053463, + "grad_norm": 45.0837594669075, + "learning_rate": 0.00015057873401461253, + "loss": 1.2054, + "step": 300 + }, + { + "epoch": 0.7314702308626975, + "grad_norm": 22.39794101270309, + "learning_rate": 0.00015023176404933874, + "loss": 1.1052, + "step": 301 + }, + { + "epoch": 0.7339003645200486, + "grad_norm": 21.818512025585306, + "learning_rate": 0.00014988398337676198, + "loss": 1.1664, + "step": 302 + }, + { + "epoch": 0.7363304981773997, + "grad_norm": 33.09386163968815, + "learning_rate": 0.00014953539760983122, + "loss": 1.1364, + "step": 303 + }, + { + "epoch": 0.7387606318347509, + "grad_norm": 26.3253592215911, + "learning_rate": 0.00014918601237448923, + "loss": 1.1093, + "step": 304 + }, + { + "epoch": 0.741190765492102, + "grad_norm": 32.54878723405212, + "learning_rate": 0.0001488358333095816, + "loss": 1.182, + "step": 305 + }, + { + "epoch": 0.7436208991494532, + "grad_norm": 28.645473311846015, + "learning_rate": 0.0001484848660667658, + "loss": 1.2064, + "step": 306 + }, + { + "epoch": 0.7460510328068044, + "grad_norm": 29.02693042820854, + "learning_rate": 0.00014813311631041995, + "loss": 1.1545, + "step": 307 + }, + { + "epoch": 0.7484811664641555, + "grad_norm": 20.28193033099828, + "learning_rate": 0.00014778058971755154, + "loss": 1.1885, + "step": 308 + }, + { + "epoch": 0.7509113001215066, + "grad_norm": 121.86121371804961, + "learning_rate": 0.00014742729197770552, + "loss": 1.095, + "step": 309 + }, + { + "epoch": 0.7509113001215066, + "eval_loss": 1.133868932723999, + "eval_runtime": 52.6711, + "eval_samples_per_second": 14.125, + "eval_steps_per_second": 1.766, + "step": 309 + }, + { + "epoch": 0.7533414337788579, + "grad_norm": 50.1793074315811, + 
"learning_rate": 0.00014707322879287276, + "loss": 1.1679, + "step": 310 + }, + { + "epoch": 0.755771567436209, + "grad_norm": 31.791309498678103, + "learning_rate": 0.00014671840587739783, + "loss": 1.1277, + "step": 311 + }, + { + "epoch": 0.7582017010935601, + "grad_norm": 56.88911226488106, + "learning_rate": 0.00014636282895788688, + "loss": 1.1492, + "step": 312 + }, + { + "epoch": 0.7606318347509113, + "grad_norm": 117.29437608667352, + "learning_rate": 0.00014600650377311522, + "loss": 1.1123, + "step": 313 + }, + { + "epoch": 0.7630619684082625, + "grad_norm": 107.56728772749254, + "learning_rate": 0.00014564943607393459, + "loss": 1.171, + "step": 314 + }, + { + "epoch": 0.7654921020656136, + "grad_norm": 34.085830256919685, + "learning_rate": 0.0001452916316231805, + "loss": 1.1854, + "step": 315 + }, + { + "epoch": 0.7679222357229648, + "grad_norm": 23.625747202851176, + "learning_rate": 0.000144933096195579, + "loss": 1.1622, + "step": 316 + }, + { + "epoch": 0.7703523693803159, + "grad_norm": 56.9917185309248, + "learning_rate": 0.00014457383557765386, + "loss": 1.2037, + "step": 317 + }, + { + "epoch": 0.772782503037667, + "grad_norm": 34.55554043725056, + "learning_rate": 0.00014421385556763266, + "loss": 1.1273, + "step": 318 + }, + { + "epoch": 0.7752126366950183, + "grad_norm": 34.205286759913115, + "learning_rate": 0.00014385316197535372, + "loss": 1.2039, + "step": 319 + }, + { + "epoch": 0.7776427703523694, + "grad_norm": 27.30015395778206, + "learning_rate": 0.00014349176062217195, + "loss": 1.1903, + "step": 320 + }, + { + "epoch": 0.7800729040097205, + "grad_norm": 23.077745147127867, + "learning_rate": 0.00014312965734086518, + "loss": 1.1539, + "step": 321 + }, + { + "epoch": 0.7825030376670717, + "grad_norm": 26.22112568156326, + "learning_rate": 0.00014276685797553977, + "loss": 1.1807, + "step": 322 + }, + { + "epoch": 0.7849331713244229, + "grad_norm": 34.813719314948514, + "learning_rate": 0.0001424033683815365, + "loss": 1.1247, + 
"step": 323 + }, + { + "epoch": 0.787363304981774, + "grad_norm": 27.109609629038324, + "learning_rate": 0.00014203919442533597, + "loss": 1.1735, + "step": 324 + }, + { + "epoch": 0.7897934386391251, + "grad_norm": 144.91672798575476, + "learning_rate": 0.00014167434198446383, + "loss": 1.1007, + "step": 325 + }, + { + "epoch": 0.7922235722964763, + "grad_norm": 42.19042828736382, + "learning_rate": 0.00014130881694739616, + "loss": 1.1398, + "step": 326 + }, + { + "epoch": 0.7946537059538274, + "grad_norm": 43.00144921766715, + "learning_rate": 0.00014094262521346427, + "loss": 1.1712, + "step": 327 + }, + { + "epoch": 0.7970838396111786, + "grad_norm": 26.343159670729925, + "learning_rate": 0.0001405757726927595, + "loss": 1.2103, + "step": 328 + }, + { + "epoch": 0.7995139732685298, + "grad_norm": 31.68271222195729, + "learning_rate": 0.00014020826530603776, + "loss": 1.1578, + "step": 329 + }, + { + "epoch": 0.8019441069258809, + "grad_norm": 39.08920292536896, + "learning_rate": 0.00013984010898462416, + "loss": 1.1377, + "step": 330 + }, + { + "epoch": 0.804374240583232, + "grad_norm": 34.56898084569197, + "learning_rate": 0.00013947130967031717, + "loss": 1.1886, + "step": 331 + }, + { + "epoch": 0.8068043742405833, + "grad_norm": 42.016356369933895, + "learning_rate": 0.00013910187331529276, + "loss": 1.1577, + "step": 332 + }, + { + "epoch": 0.8092345078979344, + "grad_norm": 21.25953597879822, + "learning_rate": 0.00013873180588200827, + "loss": 1.1259, + "step": 333 + }, + { + "epoch": 0.8116646415552855, + "grad_norm": 39.49634140985428, + "learning_rate": 0.0001383611133431062, + "loss": 1.173, + "step": 334 + }, + { + "epoch": 0.8140947752126367, + "grad_norm": 29.837690582268863, + "learning_rate": 0.00013798980168131794, + "loss": 1.1322, + "step": 335 + }, + { + "epoch": 0.8165249088699879, + "grad_norm": 23.510451396240928, + "learning_rate": 0.000137617876889367, + "loss": 1.1392, + "step": 336 + }, + { + "epoch": 0.818955042527339, + 
"grad_norm": 19.183017199526635, + "learning_rate": 0.00013724534496987247, + "loss": 1.157, + "step": 337 + }, + { + "epoch": 0.8213851761846902, + "grad_norm": 51.85037647612581, + "learning_rate": 0.0001368722119352521, + "loss": 1.1255, + "step": 338 + }, + { + "epoch": 0.8238153098420413, + "grad_norm": 31.635699477838273, + "learning_rate": 0.00013649848380762513, + "loss": 1.1429, + "step": 339 + }, + { + "epoch": 0.8262454434993924, + "grad_norm": 39.6479124739029, + "learning_rate": 0.00013612416661871533, + "loss": 1.1609, + "step": 340 + }, + { + "epoch": 0.8286755771567437, + "grad_norm": 21.453228401011238, + "learning_rate": 0.0001357492664097534, + "loss": 1.1247, + "step": 341 + }, + { + "epoch": 0.8311057108140948, + "grad_norm": 28.514958428145494, + "learning_rate": 0.00013537378923137973, + "loss": 1.0845, + "step": 342 + }, + { + "epoch": 0.8335358444714459, + "grad_norm": 26.98663985253516, + "learning_rate": 0.00013499774114354655, + "loss": 1.1092, + "step": 343 + }, + { + "epoch": 0.8359659781287971, + "grad_norm": 30.76143424141064, + "learning_rate": 0.00013462112821542016, + "loss": 1.1759, + "step": 344 + }, + { + "epoch": 0.8383961117861483, + "grad_norm": 39.023771167108656, + "learning_rate": 0.0001342439565252831, + "loss": 1.1024, + "step": 345 + }, + { + "epoch": 0.8408262454434994, + "grad_norm": 29.787639099820225, + "learning_rate": 0.0001338662321604358, + "loss": 1.2141, + "step": 346 + }, + { + "epoch": 0.8432563791008505, + "grad_norm": 25.60634301240642, + "learning_rate": 0.00013348796121709862, + "loss": 1.1244, + "step": 347 + }, + { + "epoch": 0.8456865127582017, + "grad_norm": 76.98542857181108, + "learning_rate": 0.00013310914980031334, + "loss": 1.19, + "step": 348 + }, + { + "epoch": 0.8481166464155528, + "grad_norm": 110.28982985071892, + "learning_rate": 0.0001327298040238446, + "loss": 1.1295, + "step": 349 + }, + { + "epoch": 0.850546780072904, + "grad_norm": 22.610631125609732, + "learning_rate": 
0.0001323499300100811, + "loss": 1.1445, + "step": 350 + }, + { + "epoch": 0.8529769137302552, + "grad_norm": 29.958515973723888, + "learning_rate": 0.00013196953388993726, + "loss": 1.2048, + "step": 351 + }, + { + "epoch": 0.8554070473876063, + "grad_norm": 30.691798031468103, + "learning_rate": 0.00013158862180275363, + "loss": 1.1628, + "step": 352 + }, + { + "epoch": 0.8578371810449574, + "grad_norm": 28.568576369680258, + "learning_rate": 0.00013120719989619833, + "loss": 1.0899, + "step": 353 + }, + { + "epoch": 0.8602673147023087, + "grad_norm": 42.12623456189728, + "learning_rate": 0.0001308252743261675, + "loss": 1.1451, + "step": 354 + }, + { + "epoch": 0.8626974483596598, + "grad_norm": 112.39248005736448, + "learning_rate": 0.00013044285125668614, + "loss": 1.154, + "step": 355 + }, + { + "epoch": 0.8651275820170109, + "grad_norm": 28.013602355549782, + "learning_rate": 0.0001300599368598086, + "loss": 1.1937, + "step": 356 + }, + { + "epoch": 0.8675577156743621, + "grad_norm": 27.763517972300694, + "learning_rate": 0.0001296765373155188, + "loss": 1.1243, + "step": 357 + }, + { + "epoch": 0.8699878493317132, + "grad_norm": 112.85815824767063, + "learning_rate": 0.0001292926588116308, + "loss": 1.1595, + "step": 358 + }, + { + "epoch": 0.8724179829890644, + "grad_norm": 27.085127886556087, + "learning_rate": 0.00012890830754368855, + "loss": 1.1196, + "step": 359 + }, + { + "epoch": 0.8748481166464156, + "grad_norm": 31.56336829128541, + "learning_rate": 0.00012852348971486617, + "loss": 1.1231, + "step": 360 + }, + { + "epoch": 0.8772782503037667, + "grad_norm": 31.904393738907178, + "learning_rate": 0.0001281382115358679, + "loss": 1.097, + "step": 361 + }, + { + "epoch": 0.8797083839611178, + "grad_norm": 25.034453894065827, + "learning_rate": 0.00012775247922482748, + "loss": 1.1246, + "step": 362 + }, + { + "epoch": 0.8821385176184691, + "grad_norm": 33.221958266501474, + "learning_rate": 0.0001273662990072083, + "loss": 1.1189, + "step": 363 + }, 
+ { + "epoch": 0.8845686512758202, + "grad_norm": 26.638980136773224, + "learning_rate": 0.00012697967711570242, + "loss": 1.1315, + "step": 364 + }, + { + "epoch": 0.8869987849331713, + "grad_norm": 27.231479341362885, + "learning_rate": 0.00012659261979013043, + "loss": 1.1464, + "step": 365 + }, + { + "epoch": 0.8894289185905225, + "grad_norm": 19.654091006710207, + "learning_rate": 0.0001262051332773404, + "loss": 1.1271, + "step": 366 + }, + { + "epoch": 0.8918590522478737, + "grad_norm": 50.3934263865559, + "learning_rate": 0.00012581722383110718, + "loss": 1.1002, + "step": 367 + }, + { + "epoch": 0.8942891859052248, + "grad_norm": 20.25952031318632, + "learning_rate": 0.00012542889771203166, + "loss": 1.0629, + "step": 368 + }, + { + "epoch": 0.8967193195625759, + "grad_norm": 19.16914945262315, + "learning_rate": 0.00012504016118743935, + "loss": 1.1597, + "step": 369 + }, + { + "epoch": 0.8991494532199271, + "grad_norm": 35.65941460173898, + "learning_rate": 0.00012465102053127957, + "loss": 1.1501, + "step": 370 + }, + { + "epoch": 0.9015795868772782, + "grad_norm": 26.093269180565315, + "learning_rate": 0.00012426148202402404, + "loss": 1.1455, + "step": 371 + }, + { + "epoch": 0.9040097205346294, + "grad_norm": 30.928987547424892, + "learning_rate": 0.00012387155195256537, + "loss": 1.1392, + "step": 372 + }, + { + "epoch": 0.9064398541919806, + "grad_norm": 20.17512596846915, + "learning_rate": 0.00012348123661011601, + "loss": 1.1196, + "step": 373 + }, + { + "epoch": 0.9088699878493317, + "grad_norm": 24.380789157356805, + "learning_rate": 0.00012309054229610623, + "loss": 1.1, + "step": 374 + }, + { + "epoch": 0.9113001215066828, + "grad_norm": 95.49408387682203, + "learning_rate": 0.00012269947531608276, + "loss": 1.1825, + "step": 375 + }, + { + "epoch": 0.913730255164034, + "grad_norm": 23.635286340368726, + "learning_rate": 0.0001223080419816069, + "loss": 1.1717, + "step": 376 + }, + { + "epoch": 0.9161603888213852, + "grad_norm": 
21.942478063568313, + "learning_rate": 0.00012191624861015254, + "loss": 1.1661, + "step": 377 + }, + { + "epoch": 0.9185905224787363, + "grad_norm": 74.12601397150299, + "learning_rate": 0.00012152410152500453, + "loss": 1.1967, + "step": 378 + }, + { + "epoch": 0.9210206561360875, + "grad_norm": 37.26720386499629, + "learning_rate": 0.00012113160705515625, + "loss": 1.1566, + "step": 379 + }, + { + "epoch": 0.9234507897934386, + "grad_norm": 34.080854733427635, + "learning_rate": 0.00012073877153520776, + "loss": 1.0847, + "step": 380 + }, + { + "epoch": 0.9258809234507898, + "grad_norm": 26.50842916877183, + "learning_rate": 0.0001203456013052634, + "loss": 1.0824, + "step": 381 + }, + { + "epoch": 0.928311057108141, + "grad_norm": 37.92039651416441, + "learning_rate": 0.00011995210271082944, + "loss": 1.1485, + "step": 382 + }, + { + "epoch": 0.9307411907654921, + "grad_norm": 38.56931832374284, + "learning_rate": 0.00011955828210271187, + "loss": 1.0737, + "step": 383 + }, + { + "epoch": 0.9331713244228432, + "grad_norm": 24.419015296791592, + "learning_rate": 0.0001191641458369136, + "loss": 1.1208, + "step": 384 + }, + { + "epoch": 0.9356014580801945, + "grad_norm": 28.75379656643836, + "learning_rate": 0.00011876970027453222, + "loss": 1.1071, + "step": 385 + }, + { + "epoch": 0.9380315917375456, + "grad_norm": 138.39305133994282, + "learning_rate": 0.00011837495178165706, + "loss": 1.1405, + "step": 386 + }, + { + "epoch": 0.9404617253948967, + "grad_norm": 22.200435229928654, + "learning_rate": 0.00011797990672926652, + "loss": 1.124, + "step": 387 + }, + { + "epoch": 0.9428918590522479, + "grad_norm": 40.21978055156661, + "learning_rate": 0.00011758457149312538, + "loss": 1.1875, + "step": 388 + }, + { + "epoch": 0.945321992709599, + "grad_norm": 23.592672098002485, + "learning_rate": 0.00011718895245368167, + "loss": 1.1748, + "step": 389 + }, + { + "epoch": 0.9477521263669502, + "grad_norm": 17.463183827323444, + "learning_rate": 
0.00011679305599596393, + "loss": 1.1794, + "step": 390 + }, + { + "epoch": 0.9501822600243013, + "grad_norm": 36.219441964332646, + "learning_rate": 0.00011639688850947799, + "loss": 1.1459, + "step": 391 + }, + { + "epoch": 0.9526123936816525, + "grad_norm": 23.727472560980413, + "learning_rate": 0.00011600045638810386, + "loss": 1.076, + "step": 392 + }, + { + "epoch": 0.9550425273390036, + "grad_norm": 57.63284414960702, + "learning_rate": 0.00011560376602999272, + "loss": 1.1919, + "step": 393 + }, + { + "epoch": 0.9574726609963548, + "grad_norm": 40.23829998466358, + "learning_rate": 0.00011520682383746333, + "loss": 1.0701, + "step": 394 + }, + { + "epoch": 0.959902794653706, + "grad_norm": 58.2018640218209, + "learning_rate": 0.00011480963621689905, + "loss": 1.1745, + "step": 395 + }, + { + "epoch": 0.9623329283110571, + "grad_norm": 27.693448904288406, + "learning_rate": 0.00011441220957864421, + "loss": 1.1323, + "step": 396 + }, + { + "epoch": 0.9647630619684082, + "grad_norm": 34.94430005820724, + "learning_rate": 0.00011401455033690076, + "loss": 1.1497, + "step": 397 + }, + { + "epoch": 0.9671931956257594, + "grad_norm": 17.521922247865188, + "learning_rate": 0.00011361666490962468, + "loss": 1.1319, + "step": 398 + }, + { + "epoch": 0.9696233292831106, + "grad_norm": 25.886687159935246, + "learning_rate": 0.00011321855971842243, + "loss": 1.1418, + "step": 399 + }, + { + "epoch": 0.9720534629404617, + "grad_norm": 31.388154506614836, + "learning_rate": 0.00011282024118844738, + "loss": 1.1282, + "step": 400 + }, + { + "epoch": 0.9744835965978129, + "grad_norm": 27.458601253675347, + "learning_rate": 0.00011242171574829599, + "loss": 1.1647, + "step": 401 + }, + { + "epoch": 0.976913730255164, + "grad_norm": 25.922873022924257, + "learning_rate": 0.00011202298982990411, + "loss": 1.091, + "step": 402 + }, + { + "epoch": 0.9793438639125152, + "grad_norm": 20.129467589894766, + "learning_rate": 0.00011162406986844323, + "loss": 1.2, + "step": 403 + }, 
+ { + "epoch": 0.9817739975698664, + "grad_norm": 25.11892123906363, + "learning_rate": 0.00011122496230221645, + "loss": 1.0731, + "step": 404 + }, + { + "epoch": 0.9842041312272175, + "grad_norm": 26.416884392453543, + "learning_rate": 0.00011082567357255484, + "loss": 1.1836, + "step": 405 + }, + { + "epoch": 0.9866342648845686, + "grad_norm": 18.768078773975784, + "learning_rate": 0.00011042621012371322, + "loss": 1.1275, + "step": 406 + }, + { + "epoch": 0.9890643985419199, + "grad_norm": 22.275756523796257, + "learning_rate": 0.00011002657840276627, + "loss": 1.1228, + "step": 407 + }, + { + "epoch": 0.991494532199271, + "grad_norm": 29.605335344828575, + "learning_rate": 0.00010962678485950455, + "loss": 1.0255, + "step": 408 + }, + { + "epoch": 0.9939246658566221, + "grad_norm": 41.1718200727633, + "learning_rate": 0.00010922683594633021, + "loss": 1.1876, + "step": 409 + }, + { + "epoch": 0.9963547995139733, + "grad_norm": 20.46397475257922, + "learning_rate": 0.00010882673811815304, + "loss": 1.1168, + "step": 410 + }, + { + "epoch": 0.9987849331713244, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010842649783228624, + "loss": 1.1948, + "step": 411 + }, + { + "epoch": 1.0, + "grad_norm": 21.084924025016928, + "learning_rate": 0.00010802612154834211, + "loss": 1.1076, + "step": 412 + }, + { + "epoch": 1.0, + "eval_loss": 1.121336579322815, + "eval_runtime": 52.7043, + "eval_samples_per_second": 14.116, + "eval_steps_per_second": 1.765, + "step": 412 + }, + { + "epoch": 1.0024301336573511, + "grad_norm": 35.25758968935371, + "learning_rate": 0.00010762561572812788, + "loss": 1.1335, + "step": 413 + }, + { + "epoch": 1.0048602673147022, + "grad_norm": 20.78715726366623, + "learning_rate": 0.0001072249868355415, + "loss": 1.1003, + "step": 414 + }, + { + "epoch": 1.0072904009720534, + "grad_norm": 31.01116633763719, + "learning_rate": 0.0001068242413364671, + "loss": 1.1225, + "step": 415 + }, + { + "epoch": 1.0097205346294047, + "grad_norm": 
19.050638172672897, + "learning_rate": 0.00010642338569867086, + "loss": 1.0595, + "step": 416 + }, + { + "epoch": 1.0121506682867558, + "grad_norm": 41.54235389574412, + "learning_rate": 0.00010602242639169648, + "loss": 1.1719, + "step": 417 + }, + { + "epoch": 1.014580801944107, + "grad_norm": 41.34218206464363, + "learning_rate": 0.00010562136988676078, + "loss": 1.1292, + "step": 418 + }, + { + "epoch": 1.017010935601458, + "grad_norm": 32.436985934581934, + "learning_rate": 0.0001052202226566494, + "loss": 1.1244, + "step": 419 + }, + { + "epoch": 1.0194410692588092, + "grad_norm": 19.631825450596665, + "learning_rate": 0.0001048189911756121, + "loss": 1.1323, + "step": 420 + }, + { + "epoch": 1.0218712029161603, + "grad_norm": 23.275029440216805, + "learning_rate": 0.00010441768191925847, + "loss": 1.1605, + "step": 421 + }, + { + "epoch": 1.0243013365735116, + "grad_norm": 21.44161988455765, + "learning_rate": 0.0001040163013644533, + "loss": 1.0886, + "step": 422 + }, + { + "epoch": 1.0267314702308628, + "grad_norm": 31.9765167465431, + "learning_rate": 0.00010361485598921212, + "loss": 1.1378, + "step": 423 + }, + { + "epoch": 1.0291616038882139, + "grad_norm": 22.340741556027833, + "learning_rate": 0.00010321335227259661, + "loss": 1.1278, + "step": 424 + }, + { + "epoch": 1.031591737545565, + "grad_norm": 29.27286563037163, + "learning_rate": 0.00010281179669461005, + "loss": 1.1186, + "step": 425 + }, + { + "epoch": 1.034021871202916, + "grad_norm": 65.85877610734141, + "learning_rate": 0.00010241019573609269, + "loss": 1.1673, + "step": 426 + }, + { + "epoch": 1.0364520048602672, + "grad_norm": 35.173784527846884, + "learning_rate": 0.00010200855587861724, + "loss": 1.0903, + "step": 427 + }, + { + "epoch": 1.0388821385176186, + "grad_norm": 29.91546238299385, + "learning_rate": 0.00010160688360438419, + "loss": 1.0884, + "step": 428 + }, + { + "epoch": 1.0413122721749697, + "grad_norm": 26.873308685100223, + "learning_rate": 0.0001012051853961172, + 
"loss": 1.1296, + "step": 429 + }, + { + "epoch": 1.0437424058323208, + "grad_norm": 25.90622275527891, + "learning_rate": 0.00010080346773695853, + "loss": 1.1349, + "step": 430 + }, + { + "epoch": 1.046172539489672, + "grad_norm": 21.388851321680434, + "learning_rate": 0.00010040173711036431, + "loss": 1.0947, + "step": 431 + }, + { + "epoch": 1.048602673147023, + "grad_norm": 31.206506843880053, + "learning_rate": 0.0001, + "loss": 1.1541, + "step": 432 + }, + { + "epoch": 1.0510328068043742, + "grad_norm": 19.486767323523555, + "learning_rate": 9.959826288963571e-05, + "loss": 1.1574, + "step": 433 + }, + { + "epoch": 1.0534629404617255, + "grad_norm": 102.81325604770561, + "learning_rate": 9.919653226304148e-05, + "loss": 1.1762, + "step": 434 + }, + { + "epoch": 1.0558930741190766, + "grad_norm": 17.18170280255333, + "learning_rate": 9.879481460388282e-05, + "loss": 1.1208, + "step": 435 + }, + { + "epoch": 1.0583232077764277, + "grad_norm": 29.88292309614927, + "learning_rate": 9.839311639561583e-05, + "loss": 1.1114, + "step": 436 + }, + { + "epoch": 1.0607533414337789, + "grad_norm": 23.50392429976475, + "learning_rate": 9.799144412138275e-05, + "loss": 1.2026, + "step": 437 + }, + { + "epoch": 1.06318347509113, + "grad_norm": 24.794408487434744, + "learning_rate": 9.758980426390732e-05, + "loss": 1.1587, + "step": 438 + }, + { + "epoch": 1.065613608748481, + "grad_norm": 38.726295800289655, + "learning_rate": 9.718820330538998e-05, + "loss": 1.14, + "step": 439 + }, + { + "epoch": 1.0680437424058322, + "grad_norm": 31.152256057732977, + "learning_rate": 9.678664772740343e-05, + "loss": 1.0882, + "step": 440 + }, + { + "epoch": 1.0704738760631836, + "grad_norm": 65.73380095432839, + "learning_rate": 9.638514401078788e-05, + "loss": 1.1213, + "step": 441 + }, + { + "epoch": 1.0729040097205347, + "grad_norm": 69.07317297910537, + "learning_rate": 9.598369863554673e-05, + "loss": 1.1285, + "step": 442 + }, + { + "epoch": 1.0753341433778858, + "grad_norm": 
62.55969576940585, + "learning_rate": 9.558231808074156e-05, + "loss": 1.1252, + "step": 443 + }, + { + "epoch": 1.077764277035237, + "grad_norm": 26.35106444530265, + "learning_rate": 9.51810088243879e-05, + "loss": 1.108, + "step": 444 + }, + { + "epoch": 1.080194410692588, + "grad_norm": 76.70006955440516, + "learning_rate": 9.477977734335061e-05, + "loss": 1.1144, + "step": 445 + }, + { + "epoch": 1.0826245443499392, + "grad_norm": 22.376983523395264, + "learning_rate": 9.437863011323922e-05, + "loss": 1.173, + "step": 446 + }, + { + "epoch": 1.0850546780072905, + "grad_norm": 33.51322062360491, + "learning_rate": 9.397757360830353e-05, + "loss": 1.089, + "step": 447 + }, + { + "epoch": 1.0874848116646416, + "grad_norm": 24.87252097324779, + "learning_rate": 9.357661430132915e-05, + "loss": 1.098, + "step": 448 + }, + { + "epoch": 1.0899149453219927, + "grad_norm": 48.95371674408058, + "learning_rate": 9.317575866353292e-05, + "loss": 1.0491, + "step": 449 + }, + { + "epoch": 1.0923450789793439, + "grad_norm": 25.50740340531524, + "learning_rate": 9.277501316445854e-05, + "loss": 1.0939, + "step": 450 + }, + { + "epoch": 1.094775212636695, + "grad_norm": 27.60998778610316, + "learning_rate": 9.23743842718721e-05, + "loss": 1.1564, + "step": 451 + }, + { + "epoch": 1.097205346294046, + "grad_norm": 63.99226186124907, + "learning_rate": 9.197387845165793e-05, + "loss": 1.1088, + "step": 452 + }, + { + "epoch": 1.0996354799513974, + "grad_norm": 36.441157466567596, + "learning_rate": 9.157350216771378e-05, + "loss": 1.0897, + "step": 453 + }, + { + "epoch": 1.1020656136087486, + "grad_norm": 32.32587774153429, + "learning_rate": 9.117326188184695e-05, + "loss": 1.1285, + "step": 454 + }, + { + "epoch": 1.1044957472660997, + "grad_norm": 33.39257750037465, + "learning_rate": 9.077316405366981e-05, + "loss": 1.1568, + "step": 455 + }, + { + "epoch": 1.1069258809234508, + "grad_norm": 45.03485873480868, + "learning_rate": 9.037321514049548e-05, + "loss": 1.0791, + 
"step": 456 + }, + { + "epoch": 1.109356014580802, + "grad_norm": 35.1451377482015, + "learning_rate": 8.997342159723371e-05, + "loss": 1.1243, + "step": 457 + }, + { + "epoch": 1.111786148238153, + "grad_norm": 67.01465976966, + "learning_rate": 8.957378987628682e-05, + "loss": 1.0978, + "step": 458 + }, + { + "epoch": 1.1142162818955041, + "grad_norm": 33.057859846207634, + "learning_rate": 8.917432642744518e-05, + "loss": 1.1431, + "step": 459 + }, + { + "epoch": 1.1166464155528555, + "grad_norm": 30.602840863536635, + "learning_rate": 8.877503769778356e-05, + "loss": 1.1157, + "step": 460 + }, + { + "epoch": 1.1190765492102066, + "grad_norm": 38.088467248288964, + "learning_rate": 8.83759301315568e-05, + "loss": 1.0776, + "step": 461 + }, + { + "epoch": 1.1215066828675577, + "grad_norm": 66.03671829863266, + "learning_rate": 8.797701017009591e-05, + "loss": 1.1468, + "step": 462 + }, + { + "epoch": 1.1239368165249088, + "grad_norm": 32.293691874682686, + "learning_rate": 8.757828425170404e-05, + "loss": 1.1115, + "step": 463 + }, + { + "epoch": 1.12636695018226, + "grad_norm": 32.70707175332633, + "learning_rate": 8.717975881155261e-05, + "loss": 1.1677, + "step": 464 + }, + { + "epoch": 1.128797083839611, + "grad_norm": 48.79069594971439, + "learning_rate": 8.678144028157759e-05, + "loss": 1.1341, + "step": 465 + }, + { + "epoch": 1.1312272174969624, + "grad_norm": 37.52808559072613, + "learning_rate": 8.638333509037536e-05, + "loss": 1.1414, + "step": 466 + }, + { + "epoch": 1.1336573511543135, + "grad_norm": 27.096068124970536, + "learning_rate": 8.598544966309925e-05, + "loss": 1.1719, + "step": 467 + }, + { + "epoch": 1.1360874848116647, + "grad_norm": 16.019227077248434, + "learning_rate": 8.55877904213558e-05, + "loss": 1.1148, + "step": 468 + }, + { + "epoch": 1.1385176184690158, + "grad_norm": 29.861941956913498, + "learning_rate": 8.519036378310096e-05, + "loss": 1.1486, + "step": 469 + }, + { + "epoch": 1.140947752126367, + "grad_norm": 
23.058998452019107, + "learning_rate": 8.47931761625367e-05, + "loss": 1.0745, + "step": 470 + }, + { + "epoch": 1.143377885783718, + "grad_norm": 24.486692418227875, + "learning_rate": 8.43962339700073e-05, + "loss": 1.1333, + "step": 471 + }, + { + "epoch": 1.1458080194410694, + "grad_norm": 31.632544516924323, + "learning_rate": 8.399954361189615e-05, + "loss": 1.1565, + "step": 472 + }, + { + "epoch": 1.1482381530984205, + "grad_norm": 21.67735267443374, + "learning_rate": 8.360311149052205e-05, + "loss": 1.109, + "step": 473 + }, + { + "epoch": 1.1506682867557716, + "grad_norm": 29.096918560226527, + "learning_rate": 8.320694400403606e-05, + "loss": 1.1517, + "step": 474 + }, + { + "epoch": 1.1530984204131227, + "grad_norm": 46.067313216206955, + "learning_rate": 8.281104754631835e-05, + "loss": 1.1043, + "step": 475 + }, + { + "epoch": 1.1555285540704738, + "grad_norm": 30.84953769166141, + "learning_rate": 8.241542850687465e-05, + "loss": 1.1081, + "step": 476 + }, + { + "epoch": 1.157958687727825, + "grad_norm": 39.34158523904847, + "learning_rate": 8.20200932707335e-05, + "loss": 1.1787, + "step": 477 + }, + { + "epoch": 1.160388821385176, + "grad_norm": 39.14663302484904, + "learning_rate": 8.162504821834295e-05, + "loss": 1.202, + "step": 478 + }, + { + "epoch": 1.1628189550425274, + "grad_norm": 49.7279004249915, + "learning_rate": 8.123029972546781e-05, + "loss": 1.1439, + "step": 479 + }, + { + "epoch": 1.1652490886998785, + "grad_norm": 35.49897960878779, + "learning_rate": 8.083585416308642e-05, + "loss": 1.0741, + "step": 480 + }, + { + "epoch": 1.1676792223572297, + "grad_norm": 31.306252618855535, + "learning_rate": 8.044171789728816e-05, + "loss": 1.0697, + "step": 481 + }, + { + "epoch": 1.1701093560145808, + "grad_norm": 22.40745672651249, + "learning_rate": 8.004789728917059e-05, + "loss": 1.1498, + "step": 482 + }, + { + "epoch": 1.172539489671932, + "grad_norm": 32.19326746671122, + "learning_rate": 7.965439869473664e-05, + "loss": 1.1392, 
+ "step": 483 + }, + { + "epoch": 1.1749696233292832, + "grad_norm": 33.66876390791385, + "learning_rate": 7.926122846479224e-05, + "loss": 1.1049, + "step": 484 + }, + { + "epoch": 1.1773997569866343, + "grad_norm": 35.43357233261174, + "learning_rate": 7.886839294484377e-05, + "loss": 1.0467, + "step": 485 + }, + { + "epoch": 1.1798298906439855, + "grad_norm": 50.660998166256256, + "learning_rate": 7.84758984749955e-05, + "loss": 1.1244, + "step": 486 + }, + { + "epoch": 1.1822600243013366, + "grad_norm": 41.356845334605936, + "learning_rate": 7.808375138984745e-05, + "loss": 1.1279, + "step": 487 + }, + { + "epoch": 1.1846901579586877, + "grad_norm": 22.947663723281487, + "learning_rate": 7.769195801839313e-05, + "loss": 1.0787, + "step": 488 + }, + { + "epoch": 1.1871202916160388, + "grad_norm": 36.434647074399905, + "learning_rate": 7.730052468391725e-05, + "loss": 1.1148, + "step": 489 + }, + { + "epoch": 1.18955042527339, + "grad_norm": 75.94549877059467, + "learning_rate": 7.690945770389377e-05, + "loss": 1.1127, + "step": 490 + }, + { + "epoch": 1.1919805589307413, + "grad_norm": 68.03126664734435, + "learning_rate": 7.6518763389884e-05, + "loss": 1.1672, + "step": 491 + }, + { + "epoch": 1.1944106925880924, + "grad_norm": 40.15361719091623, + "learning_rate": 7.612844804743466e-05, + "loss": 1.0962, + "step": 492 + }, + { + "epoch": 1.1968408262454435, + "grad_norm": 105.80023571763755, + "learning_rate": 7.573851797597602e-05, + "loss": 1.1091, + "step": 493 + }, + { + "epoch": 1.1992709599027946, + "grad_norm": 41.84401502420881, + "learning_rate": 7.534897946872042e-05, + "loss": 1.1359, + "step": 494 + }, + { + "epoch": 1.2017010935601458, + "grad_norm": 21.985533615468846, + "learning_rate": 7.495983881256067e-05, + "loss": 1.1024, + "step": 495 + }, + { + "epoch": 1.2041312272174969, + "grad_norm": 23.02649898605792, + "learning_rate": 7.457110228796838e-05, + "loss": 1.1089, + "step": 496 + }, + { + "epoch": 1.206561360874848, + "grad_norm": 
74.4950498938832, + "learning_rate": 7.418277616889282e-05, + "loss": 1.0439, + "step": 497 + }, + { + "epoch": 1.2089914945321993, + "grad_norm": 27.637660484960865, + "learning_rate": 7.379486672265964e-05, + "loss": 1.1453, + "step": 498 + }, + { + "epoch": 1.2114216281895505, + "grad_norm": 34.98561655821008, + "learning_rate": 7.340738020986961e-05, + "loss": 1.139, + "step": 499 + }, + { + "epoch": 1.2138517618469016, + "grad_norm": 28.47627677351389, + "learning_rate": 7.302032288429756e-05, + "loss": 1.0623, + "step": 500 + }, + { + "epoch": 1.2162818955042527, + "grad_norm": 39.551486186427596, + "learning_rate": 7.263370099279172e-05, + "loss": 1.1277, + "step": 501 + }, + { + "epoch": 1.2187120291616038, + "grad_norm": 44.12973085459368, + "learning_rate": 7.224752077517253e-05, + "loss": 1.1768, + "step": 502 + }, + { + "epoch": 1.2211421628189552, + "grad_norm": 84.84836585196132, + "learning_rate": 7.186178846413214e-05, + "loss": 1.1892, + "step": 503 + }, + { + "epoch": 1.2235722964763063, + "grad_norm": 34.94807915131505, + "learning_rate": 7.147651028513383e-05, + "loss": 1.1108, + "step": 504 + }, + { + "epoch": 1.2260024301336574, + "grad_norm": 46.19847384406232, + "learning_rate": 7.109169245631149e-05, + "loss": 1.0956, + "step": 505 + }, + { + "epoch": 1.2284325637910085, + "grad_norm": 38.58484473058957, + "learning_rate": 7.070734118836925e-05, + "loss": 1.1175, + "step": 506 + }, + { + "epoch": 1.2308626974483596, + "grad_norm": 37.84739298111386, + "learning_rate": 7.032346268448118e-05, + "loss": 1.1411, + "step": 507 + }, + { + "epoch": 1.2332928311057108, + "grad_norm": 53.5471335398439, + "learning_rate": 6.994006314019141e-05, + "loss": 1.1332, + "step": 508 + }, + { + "epoch": 1.2357229647630619, + "grad_norm": 91.55067777365485, + "learning_rate": 6.955714874331387e-05, + "loss": 1.1205, + "step": 509 + }, + { + "epoch": 1.2381530984204132, + "grad_norm": 27.05333642785952, + "learning_rate": 6.917472567383252e-05, + "loss": 
1.099, + "step": 510 + }, + { + "epoch": 1.2405832320777643, + "grad_norm": 24.519879042487336, + "learning_rate": 6.87928001038017e-05, + "loss": 1.1401, + "step": 511 + }, + { + "epoch": 1.2430133657351154, + "grad_norm": 33.763495598365786, + "learning_rate": 6.84113781972464e-05, + "loss": 1.2058, + "step": 512 + }, + { + "epoch": 1.2454434993924666, + "grad_norm": 34.49114206138826, + "learning_rate": 6.803046611006278e-05, + "loss": 1.1044, + "step": 513 + }, + { + "epoch": 1.2478736330498177, + "grad_norm": 74.20211157975073, + "learning_rate": 6.765006998991888e-05, + "loss": 1.111, + "step": 514 + }, + { + "epoch": 1.250303766707169, + "grad_norm": 32.30436806042553, + "learning_rate": 6.727019597615545e-05, + "loss": 1.1063, + "step": 515 + }, + { + "epoch": 1.250303766707169, + "eval_loss": 1.1128273010253906, + "eval_runtime": 53.4998, + "eval_samples_per_second": 13.907, + "eval_steps_per_second": 1.738, + "step": 515 + }, + { + "epoch": 1.25273390036452, + "grad_norm": 42.104054612880084, + "learning_rate": 6.689085019968669e-05, + "loss": 1.1315, + "step": 516 + }, + { + "epoch": 1.2551640340218713, + "grad_norm": 25.66097714624212, + "learning_rate": 6.651203878290139e-05, + "loss": 1.0916, + "step": 517 + }, + { + "epoch": 1.2575941676792224, + "grad_norm": 35.12310576456352, + "learning_rate": 6.613376783956423e-05, + "loss": 1.0699, + "step": 518 + }, + { + "epoch": 1.2600243013365735, + "grad_norm": 34.172951559594566, + "learning_rate": 6.575604347471695e-05, + "loss": 1.1412, + "step": 519 + }, + { + "epoch": 1.2624544349939246, + "grad_norm": 54.373563773275116, + "learning_rate": 6.537887178457984e-05, + "loss": 1.1255, + "step": 520 + }, + { + "epoch": 1.2648845686512757, + "grad_norm": 33.806385046788755, + "learning_rate": 6.500225885645346e-05, + "loss": 1.101, + "step": 521 + }, + { + "epoch": 1.267314702308627, + "grad_norm": 34.17813695957543, + "learning_rate": 6.46262107686203e-05, + "loss": 1.1226, + "step": 522 + }, + { + "epoch": 
1.2697448359659782, + "grad_norm": 24.68048087106548, + "learning_rate": 6.425073359024663e-05, + "loss": 1.1787, + "step": 523 + }, + { + "epoch": 1.2721749696233293, + "grad_norm": 32.78749757697808, + "learning_rate": 6.387583338128471e-05, + "loss": 1.0541, + "step": 524 + }, + { + "epoch": 1.2746051032806804, + "grad_norm": 30.906673844090044, + "learning_rate": 6.350151619237488e-05, + "loss": 1.0964, + "step": 525 + }, + { + "epoch": 1.2770352369380316, + "grad_norm": 32.571858392892736, + "learning_rate": 6.312778806474795e-05, + "loss": 1.1251, + "step": 526 + }, + { + "epoch": 1.2794653705953827, + "grad_norm": 43.02428916532565, + "learning_rate": 6.275465503012751e-05, + "loss": 1.0473, + "step": 527 + }, + { + "epoch": 1.2818955042527338, + "grad_norm": 60.93587506764561, + "learning_rate": 6.2382123110633e-05, + "loss": 1.078, + "step": 528 + }, + { + "epoch": 1.2843256379100851, + "grad_norm": 64.6934775930251, + "learning_rate": 6.201019831868208e-05, + "loss": 1.0904, + "step": 529 + }, + { + "epoch": 1.2867557715674363, + "grad_norm": 32.977077613035426, + "learning_rate": 6.16388866568938e-05, + "loss": 1.0705, + "step": 530 + }, + { + "epoch": 1.2891859052247874, + "grad_norm": 28.27407310492513, + "learning_rate": 6.126819411799175e-05, + "loss": 1.1252, + "step": 531 + }, + { + "epoch": 1.2916160388821385, + "grad_norm": 33.73515826089828, + "learning_rate": 6.0898126684707265e-05, + "loss": 1.1262, + "step": 532 + }, + { + "epoch": 1.2940461725394896, + "grad_norm": 25.370361818959903, + "learning_rate": 6.052869032968285e-05, + "loss": 1.0845, + "step": 533 + }, + { + "epoch": 1.296476306196841, + "grad_norm": 37.389287060597105, + "learning_rate": 6.015989101537586e-05, + "loss": 1.1352, + "step": 534 + }, + { + "epoch": 1.2989064398541919, + "grad_norm": 39.04755104008223, + "learning_rate": 5.979173469396227e-05, + "loss": 1.1538, + "step": 535 + }, + { + "epoch": 1.3013365735115432, + "grad_norm": 34.33676719612293, + "learning_rate": 
5.9424227307240554e-05, + "loss": 1.1725, + "step": 536 + }, + { + "epoch": 1.3037667071688943, + "grad_norm": 64.66076997769457, + "learning_rate": 5.905737478653572e-05, + "loss": 1.1146, + "step": 537 + }, + { + "epoch": 1.3061968408262454, + "grad_norm": 48.043289790386325, + "learning_rate": 5.8691183052603834e-05, + "loss": 1.1035, + "step": 538 + }, + { + "epoch": 1.3086269744835966, + "grad_norm": 49.08397341659928, + "learning_rate": 5.83256580155362e-05, + "loss": 1.1653, + "step": 539 + }, + { + "epoch": 1.3110571081409477, + "grad_norm": 46.688886812303515, + "learning_rate": 5.796080557466406e-05, + "loss": 1.1328, + "step": 540 + }, + { + "epoch": 1.313487241798299, + "grad_norm": 27.503882325413493, + "learning_rate": 5.7596631618463514e-05, + "loss": 1.1019, + "step": 541 + }, + { + "epoch": 1.3159173754556501, + "grad_norm": 48.88974129574653, + "learning_rate": 5.723314202446026e-05, + "loss": 1.121, + "step": 542 + }, + { + "epoch": 1.3183475091130012, + "grad_norm": 28.105881157995345, + "learning_rate": 5.687034265913485e-05, + "loss": 1.0898, + "step": 543 + }, + { + "epoch": 1.3207776427703524, + "grad_norm": 30.410731278414804, + "learning_rate": 5.6508239377828034e-05, + "loss": 1.07, + "step": 544 + }, + { + "epoch": 1.3232077764277035, + "grad_norm": 38.08324176765882, + "learning_rate": 5.614683802464631e-05, + "loss": 1.1503, + "step": 545 + }, + { + "epoch": 1.3256379100850546, + "grad_norm": 46.28952293745534, + "learning_rate": 5.578614443236738e-05, + "loss": 1.1282, + "step": 546 + }, + { + "epoch": 1.3280680437424057, + "grad_norm": 68.2597453597135, + "learning_rate": 5.542616442234618e-05, + "loss": 1.1373, + "step": 547 + }, + { + "epoch": 1.330498177399757, + "grad_norm": 30.351663825014143, + "learning_rate": 5.5066903804421025e-05, + "loss": 1.1633, + "step": 548 + }, + { + "epoch": 1.3329283110571082, + "grad_norm": 38.2711285636887, + "learning_rate": 5.470836837681954e-05, + "loss": 1.1604, + "step": 549 + }, + { + 
"epoch": 1.3353584447144593, + "grad_norm": 35.64230091531108, + "learning_rate": 5.4350563926065404e-05, + "loss": 1.0564, + "step": 550 + }, + { + "epoch": 1.3377885783718104, + "grad_norm": 44.869816046925564, + "learning_rate": 5.399349622688479e-05, + "loss": 1.1376, + "step": 551 + }, + { + "epoch": 1.3402187120291615, + "grad_norm": 26.681037126315633, + "learning_rate": 5.3637171042113146e-05, + "loss": 1.0867, + "step": 552 + }, + { + "epoch": 1.3426488456865129, + "grad_norm": 34.6124686262535, + "learning_rate": 5.32815941226022e-05, + "loss": 1.0474, + "step": 553 + }, + { + "epoch": 1.3450789793438638, + "grad_norm": 35.92639009060983, + "learning_rate": 5.2926771207127254e-05, + "loss": 1.0958, + "step": 554 + }, + { + "epoch": 1.3475091130012151, + "grad_norm": 39.08938922562224, + "learning_rate": 5.2572708022294504e-05, + "loss": 1.074, + "step": 555 + }, + { + "epoch": 1.3499392466585662, + "grad_norm": 76.06708166273745, + "learning_rate": 5.2219410282448514e-05, + "loss": 1.0865, + "step": 556 + }, + { + "epoch": 1.3523693803159174, + "grad_norm": 74.14222265654887, + "learning_rate": 5.1866883689580056e-05, + "loss": 1.1567, + "step": 557 + }, + { + "epoch": 1.3547995139732685, + "grad_norm": 34.82441678662901, + "learning_rate": 5.151513393323426e-05, + "loss": 1.0802, + "step": 558 + }, + { + "epoch": 1.3572296476306196, + "grad_norm": 75.53504846566143, + "learning_rate": 5.116416669041843e-05, + "loss": 1.0623, + "step": 559 + }, + { + "epoch": 1.359659781287971, + "grad_norm": 29.423475817434785, + "learning_rate": 5.0813987625510775e-05, + "loss": 1.077, + "step": 560 + }, + { + "epoch": 1.362089914945322, + "grad_norm": 44.607486168434534, + "learning_rate": 5.046460239016879e-05, + "loss": 1.096, + "step": 561 + }, + { + "epoch": 1.3645200486026732, + "grad_norm": 40.684125033315404, + "learning_rate": 5.011601662323807e-05, + "loss": 1.148, + "step": 562 + }, + { + "epoch": 1.3669501822600243, + "grad_norm": 47.33103026318705, + 
"learning_rate": 4.976823595066128e-05, + "loss": 1.1712, + "step": 563 + }, + { + "epoch": 1.3693803159173754, + "grad_norm": 51.17017845058186, + "learning_rate": 4.9421265985387476e-05, + "loss": 1.1287, + "step": 564 + }, + { + "epoch": 1.3718104495747265, + "grad_norm": 50.76665552103517, + "learning_rate": 4.907511232728145e-05, + "loss": 1.1156, + "step": 565 + }, + { + "epoch": 1.3742405832320777, + "grad_norm": 32.6007633025874, + "learning_rate": 4.872978056303327e-05, + "loss": 1.1477, + "step": 566 + }, + { + "epoch": 1.376670716889429, + "grad_norm": 29.696241441710107, + "learning_rate": 4.8385276266068146e-05, + "loss": 1.0874, + "step": 567 + }, + { + "epoch": 1.37910085054678, + "grad_norm": 58.96613500379004, + "learning_rate": 4.804160499645667e-05, + "loss": 1.0616, + "step": 568 + }, + { + "epoch": 1.3815309842041312, + "grad_norm": 37.104100020310334, + "learning_rate": 4.7698772300824756e-05, + "loss": 1.0878, + "step": 569 + }, + { + "epoch": 1.3839611178614823, + "grad_norm": 51.735902941979305, + "learning_rate": 4.735678371226441e-05, + "loss": 1.0836, + "step": 570 + }, + { + "epoch": 1.3863912515188335, + "grad_norm": 55.49190976804079, + "learning_rate": 4.7015644750244306e-05, + "loss": 1.0473, + "step": 571 + }, + { + "epoch": 1.3888213851761848, + "grad_norm": 34.27972449829039, + "learning_rate": 4.6675360920520625e-05, + "loss": 1.0723, + "step": 572 + }, + { + "epoch": 1.391251518833536, + "grad_norm": 28.508157856527724, + "learning_rate": 4.6335937715048306e-05, + "loss": 1.0723, + "step": 573 + }, + { + "epoch": 1.393681652490887, + "grad_norm": 106.84009565003795, + "learning_rate": 4.599738061189244e-05, + "loss": 1.149, + "step": 574 + }, + { + "epoch": 1.3961117861482382, + "grad_norm": 50.543394606036294, + "learning_rate": 4.565969507513981e-05, + "loss": 1.0991, + "step": 575 + }, + { + "epoch": 1.3985419198055893, + "grad_norm": 30.409124335052745, + "learning_rate": 4.532288655481062e-05, + "loss": 1.1157, + "step": 
576 + }, + { + "epoch": 1.4009720534629404, + "grad_norm": 89.92061876679301, + "learning_rate": 4.498696048677059e-05, + "loss": 1.1526, + "step": 577 + }, + { + "epoch": 1.4034021871202915, + "grad_norm": 84.27775422110602, + "learning_rate": 4.465192229264337e-05, + "loss": 1.1418, + "step": 578 + }, + { + "epoch": 1.4058323207776429, + "grad_norm": 40.7815489623743, + "learning_rate": 4.4317777379722866e-05, + "loss": 1.0831, + "step": 579 + }, + { + "epoch": 1.408262454434994, + "grad_norm": 66.6911504313278, + "learning_rate": 4.3984531140885943e-05, + "loss": 1.1088, + "step": 580 + }, + { + "epoch": 1.410692588092345, + "grad_norm": 137.00882181835217, + "learning_rate": 4.365218895450558e-05, + "loss": 1.1089, + "step": 581 + }, + { + "epoch": 1.4131227217496962, + "grad_norm": 41.139168895296855, + "learning_rate": 4.332075618436386e-05, + "loss": 1.1603, + "step": 582 + }, + { + "epoch": 1.4155528554070473, + "grad_norm": 35.443969765428506, + "learning_rate": 4.29902381795655e-05, + "loss": 1.0301, + "step": 583 + }, + { + "epoch": 1.4179829890643987, + "grad_norm": 32.931514576694674, + "learning_rate": 4.266064027445155e-05, + "loss": 1.1016, + "step": 584 + }, + { + "epoch": 1.4204131227217496, + "grad_norm": 64.21015694858382, + "learning_rate": 4.2331967788513295e-05, + "loss": 1.0789, + "step": 585 + }, + { + "epoch": 1.422843256379101, + "grad_norm": 84.13251752827094, + "learning_rate": 4.200422602630629e-05, + "loss": 1.1573, + "step": 586 + }, + { + "epoch": 1.425273390036452, + "grad_norm": 53.61636603108024, + "learning_rate": 4.167742027736482e-05, + "loss": 1.0942, + "step": 587 + }, + { + "epoch": 1.4277035236938032, + "grad_norm": 133.20877569415256, + "learning_rate": 4.135155581611661e-05, + "loss": 1.0877, + "step": 588 + }, + { + "epoch": 1.4301336573511543, + "grad_norm": 49.85736467319357, + "learning_rate": 4.102663790179764e-05, + "loss": 1.0619, + "step": 589 + }, + { + "epoch": 1.4325637910085054, + "grad_norm": 
91.13217639524017, + "learning_rate": 4.070267177836712e-05, + "loss": 1.1093, + "step": 590 + }, + { + "epoch": 1.4349939246658567, + "grad_norm": 49.25558128250457, + "learning_rate": 4.037966267442315e-05, + "loss": 1.1344, + "step": 591 + }, + { + "epoch": 1.4374240583232079, + "grad_norm": 95.87244356130316, + "learning_rate": 4.005761580311805e-05, + "loss": 1.0929, + "step": 592 + }, + { + "epoch": 1.439854191980559, + "grad_norm": 74.28903671045653, + "learning_rate": 3.973653636207437e-05, + "loss": 1.1263, + "step": 593 + }, + { + "epoch": 1.44228432563791, + "grad_norm": 53.99454529785116, + "learning_rate": 3.941642953330103e-05, + "loss": 1.0916, + "step": 594 + }, + { + "epoch": 1.4447144592952612, + "grad_norm": 113.26015597338959, + "learning_rate": 3.909730048310962e-05, + "loss": 1.1009, + "step": 595 + }, + { + "epoch": 1.4471445929526123, + "grad_norm": 134.4015550981493, + "learning_rate": 3.8779154362030986e-05, + "loss": 1.1351, + "step": 596 + }, + { + "epoch": 1.4495747266099634, + "grad_norm": 90.61611981238187, + "learning_rate": 3.846199630473216e-05, + "loss": 1.0827, + "step": 597 + }, + { + "epoch": 1.4520048602673148, + "grad_norm": 56.55050791518521, + "learning_rate": 3.814583142993352e-05, + "loss": 1.1145, + "step": 598 + }, + { + "epoch": 1.454434993924666, + "grad_norm": 265.6916535243014, + "learning_rate": 3.7830664840326145e-05, + "loss": 1.1459, + "step": 599 + }, + { + "epoch": 1.456865127582017, + "grad_norm": 72.81191101030372, + "learning_rate": 3.7516501622489367e-05, + "loss": 1.0903, + "step": 600 + }, + { + "epoch": 1.4592952612393681, + "grad_norm": 58.309143549086556, + "learning_rate": 3.720334684680889e-05, + "loss": 1.1041, + "step": 601 + }, + { + "epoch": 1.4617253948967193, + "grad_norm": 35.19205741792398, + "learning_rate": 3.689120556739475e-05, + "loss": 1.1523, + "step": 602 + }, + { + "epoch": 1.4641555285540706, + "grad_norm": 88.97226951757321, + "learning_rate": 3.6580082821999786e-05, + "loss": 
1.1117, + "step": 603 + }, + { + "epoch": 1.4665856622114215, + "grad_norm": 64.50873879301322, + "learning_rate": 3.6269983631938475e-05, + "loss": 1.1256, + "step": 604 + }, + { + "epoch": 1.4690157958687728, + "grad_norm": 78.10556611104111, + "learning_rate": 3.596091300200578e-05, + "loss": 1.0834, + "step": 605 + }, + { + "epoch": 1.471445929526124, + "grad_norm": 69.38449946362529, + "learning_rate": 3.565287592039628e-05, + "loss": 1.1026, + "step": 606 + }, + { + "epoch": 1.473876063183475, + "grad_norm": 79.60241521456905, + "learning_rate": 3.534587735862391e-05, + "loss": 1.0456, + "step": 607 + }, + { + "epoch": 1.4763061968408262, + "grad_norm": 89.68581306071424, + "learning_rate": 3.503992227144147e-05, + "loss": 1.0809, + "step": 608 + }, + { + "epoch": 1.4787363304981773, + "grad_norm": 68.570527237558, + "learning_rate": 3.473501559676088e-05, + "loss": 1.0754, + "step": 609 + }, + { + "epoch": 1.4811664641555287, + "grad_norm": 54.94762317625427, + "learning_rate": 3.4431162255573245e-05, + "loss": 1.1751, + "step": 610 + }, + { + "epoch": 1.4835965978128798, + "grad_norm": 109.12821602719706, + "learning_rate": 3.4128367151869714e-05, + "loss": 1.1055, + "step": 611 + }, + { + "epoch": 1.486026731470231, + "grad_norm": 198.79030469542352, + "learning_rate": 3.3826635172562094e-05, + "loss": 1.1369, + "step": 612 + }, + { + "epoch": 1.488456865127582, + "grad_norm": 62.002866716809, + "learning_rate": 3.352597118740404e-05, + "loss": 1.1611, + "step": 613 + }, + { + "epoch": 1.4908869987849331, + "grad_norm": 79.21193137029579, + "learning_rate": 3.3226380048912585e-05, + "loss": 1.1688, + "step": 614 + }, + { + "epoch": 1.4933171324422843, + "grad_norm": 68.6722934326242, + "learning_rate": 3.292786659228973e-05, + "loss": 1.1248, + "step": 615 + }, + { + "epoch": 1.4957472660996354, + "grad_norm": 104.34122241838278, + "learning_rate": 3.263043563534428e-05, + "loss": 1.1425, + "step": 616 + }, + { + "epoch": 1.4981773997569867, + "grad_norm": 
86.43862038340298, + "learning_rate": 3.233409197841437e-05, + "loss": 1.0562, + "step": 617 + }, + { + "epoch": 1.5006075334143378, + "grad_norm": 79.74137751394451, + "learning_rate": 3.2038840404289705e-05, + "loss": 1.1214, + "step": 618 + }, + { + "epoch": 1.5006075334143378, + "eval_loss": 1.1088899374008179, + "eval_runtime": 53.0545, + "eval_samples_per_second": 14.023, + "eval_steps_per_second": 1.753, + "step": 618 + }, + { + "epoch": 1.503037667071689, + "grad_norm": 126.19650708566132, + "learning_rate": 3.174468567813461e-05, + "loss": 1.181, + "step": 619 + }, + { + "epoch": 1.50546780072904, + "grad_norm": 64.86293986153461, + "learning_rate": 3.14516325474109e-05, + "loss": 1.0607, + "step": 620 + }, + { + "epoch": 1.5078979343863912, + "grad_norm": 62.06308896160908, + "learning_rate": 3.115968574180149e-05, + "loss": 1.0914, + "step": 621 + }, + { + "epoch": 1.5103280680437425, + "grad_norm": 168.27548636755165, + "learning_rate": 3.086884997313387e-05, + "loss": 1.1595, + "step": 622 + }, + { + "epoch": 1.5127582017010934, + "grad_norm": 156.46495738513647, + "learning_rate": 3.0579129935304066e-05, + "loss": 1.1263, + "step": 623 + }, + { + "epoch": 1.5151883353584448, + "grad_norm": 71.761765760571, + "learning_rate": 3.029053030420115e-05, + "loss": 1.049, + "step": 624 + }, + { + "epoch": 1.517618469015796, + "grad_norm": 87.26870047585324, + "learning_rate": 3.0003055737631403e-05, + "loss": 1.1917, + "step": 625 + }, + { + "epoch": 1.520048602673147, + "grad_norm": 142.01139847883954, + "learning_rate": 2.9716710875243326e-05, + "loss": 1.1038, + "step": 626 + }, + { + "epoch": 1.5224787363304981, + "grad_norm": 81.15254185021365, + "learning_rate": 2.9431500338452832e-05, + "loss": 1.0824, + "step": 627 + }, + { + "epoch": 1.5249088699878492, + "grad_norm": 68.21138775878333, + "learning_rate": 2.9147428730368475e-05, + "loss": 1.0676, + "step": 628 + }, + { + "epoch": 1.5273390036452006, + "grad_norm": 61.929977077152344, + 
"learning_rate": 2.886450063571735e-05, + "loss": 1.1928, + "step": 629 + }, + { + "epoch": 1.5297691373025515, + "grad_norm": 76.19248167649229, + "learning_rate": 2.858272062077091e-05, + "loss": 1.0737, + "step": 630 + }, + { + "epoch": 1.5321992709599028, + "grad_norm": 67.40817795826194, + "learning_rate": 2.8302093233271453e-05, + "loss": 1.0734, + "step": 631 + }, + { + "epoch": 1.534629404617254, + "grad_norm": 35.17352084915858, + "learning_rate": 2.802262300235857e-05, + "loss": 1.0062, + "step": 632 + }, + { + "epoch": 1.537059538274605, + "grad_norm": 97.0705094618675, + "learning_rate": 2.7744314438496088e-05, + "loss": 1.121, + "step": 633 + }, + { + "epoch": 1.5394896719319564, + "grad_norm": 52.21457659022329, + "learning_rate": 2.7467172033399458e-05, + "loss": 1.1864, + "step": 634 + }, + { + "epoch": 1.5419198055893073, + "grad_norm": 260.1057846866782, + "learning_rate": 2.7191200259962934e-05, + "loss": 1.1549, + "step": 635 + }, + { + "epoch": 1.5443499392466586, + "grad_norm": 66.65086231184844, + "learning_rate": 2.691640357218759e-05, + "loss": 1.1023, + "step": 636 + }, + { + "epoch": 1.5467800729040098, + "grad_norm": 680.8791021196618, + "learning_rate": 2.6642786405109475e-05, + "loss": 1.0943, + "step": 637 + }, + { + "epoch": 1.5492102065613609, + "grad_norm": 36.199872792671414, + "learning_rate": 2.6370353174727836e-05, + "loss": 1.0924, + "step": 638 + }, + { + "epoch": 1.551640340218712, + "grad_norm": 84.1148767833362, + "learning_rate": 2.6099108277934103e-05, + "loss": 1.1361, + "step": 639 + }, + { + "epoch": 1.5540704738760631, + "grad_norm": 81.84432345021693, + "learning_rate": 2.5829056092440662e-05, + "loss": 1.0868, + "step": 640 + }, + { + "epoch": 1.5565006075334145, + "grad_norm": 39.42683610456025, + "learning_rate": 2.556020097671046e-05, + "loss": 1.1506, + "step": 641 + }, + { + "epoch": 1.5589307411907654, + "grad_norm": 54.33249421192736, + "learning_rate": 2.5292547269886392e-05, + "loss": 1.0517, + "step": 642 
+ }, + { + "epoch": 1.5613608748481167, + "grad_norm": 410.5903072488164, + "learning_rate": 2.5026099291721516e-05, + "loss": 1.0995, + "step": 643 + }, + { + "epoch": 1.5637910085054678, + "grad_norm": 83.574545998207, + "learning_rate": 2.4760861342509233e-05, + "loss": 1.0792, + "step": 644 + }, + { + "epoch": 1.566221142162819, + "grad_norm": 399.66181496308434, + "learning_rate": 2.449683770301382e-05, + "loss": 1.2167, + "step": 645 + }, + { + "epoch": 1.56865127582017, + "grad_norm": 55.12309263364805, + "learning_rate": 2.4234032634401406e-05, + "loss": 1.0332, + "step": 646 + }, + { + "epoch": 1.5710814094775212, + "grad_norm": 61.30588953316776, + "learning_rate": 2.397245037817125e-05, + "loss": 1.0659, + "step": 647 + }, + { + "epoch": 1.5735115431348725, + "grad_norm": 75.74467195338701, + "learning_rate": 2.371209515608718e-05, + "loss": 1.1254, + "step": 648 + }, + { + "epoch": 1.5759416767922234, + "grad_norm": 67.98309962901806, + "learning_rate": 2.345297117010954e-05, + "loss": 1.1119, + "step": 649 + }, + { + "epoch": 1.5783718104495748, + "grad_norm": 59.08178521357814, + "learning_rate": 2.3195082602327312e-05, + "loss": 1.0866, + "step": 650 + }, + { + "epoch": 1.5808019441069259, + "grad_norm": 94.26571313695092, + "learning_rate": 2.2938433614890697e-05, + "loss": 1.1742, + "step": 651 + }, + { + "epoch": 1.583232077764277, + "grad_norm": 92.74387959878898, + "learning_rate": 2.2683028349943815e-05, + "loss": 1.1765, + "step": 652 + }, + { + "epoch": 1.5856622114216283, + "grad_norm": 54.0790750014235, + "learning_rate": 2.242887092955801e-05, + "loss": 1.0979, + "step": 653 + }, + { + "epoch": 1.5880923450789792, + "grad_norm": 55.72195824432094, + "learning_rate": 2.2175965455665226e-05, + "loss": 1.0826, + "step": 654 + }, + { + "epoch": 1.5905224787363306, + "grad_norm": 60.8162820416134, + "learning_rate": 2.1924316009991787e-05, + "loss": 1.0884, + "step": 655 + }, + { + "epoch": 1.5929526123936817, + "grad_norm": 67.20621804796278, 
+ "learning_rate": 2.167392665399256e-05, + "loss": 1.1426, + "step": 656 + }, + { + "epoch": 1.5953827460510328, + "grad_norm": 63.50889552696206, + "learning_rate": 2.1424801428785447e-05, + "loss": 1.1819, + "step": 657 + }, + { + "epoch": 1.597812879708384, + "grad_norm": 60.34121097929382, + "learning_rate": 2.1176944355086058e-05, + "loss": 1.1051, + "step": 658 + }, + { + "epoch": 1.600243013365735, + "grad_norm": 91.95807405182529, + "learning_rate": 2.0930359433142932e-05, + "loss": 1.0768, + "step": 659 + }, + { + "epoch": 1.6026731470230864, + "grad_norm": 33.84817514299781, + "learning_rate": 2.068505064267292e-05, + "loss": 1.1556, + "step": 660 + }, + { + "epoch": 1.6051032806804373, + "grad_norm": 44.846129252871364, + "learning_rate": 2.0441021942796944e-05, + "loss": 1.192, + "step": 661 + }, + { + "epoch": 1.6075334143377886, + "grad_norm": 104.85494442468764, + "learning_rate": 2.0198277271976052e-05, + "loss": 1.1912, + "step": 662 + }, + { + "epoch": 1.6099635479951397, + "grad_norm": 59.541562510020924, + "learning_rate": 1.995682054794803e-05, + "loss": 1.0932, + "step": 663 + }, + { + "epoch": 1.6123936816524909, + "grad_norm": 57.73876590809742, + "learning_rate": 1.9716655667664008e-05, + "loss": 1.1691, + "step": 664 + }, + { + "epoch": 1.6148238153098422, + "grad_norm": 37.00550106127363, + "learning_rate": 1.9477786507225616e-05, + "loss": 1.0974, + "step": 665 + }, + { + "epoch": 1.617253948967193, + "grad_norm": 271.6238263663105, + "learning_rate": 1.924021692182236e-05, + "loss": 1.1196, + "step": 666 + }, + { + "epoch": 1.6196840826245444, + "grad_norm": 69.94535819115217, + "learning_rate": 1.900395074566962e-05, + "loss": 1.1219, + "step": 667 + }, + { + "epoch": 1.6221142162818953, + "grad_norm": 64.77937566314249, + "learning_rate": 1.8768991791946456e-05, + "loss": 1.0457, + "step": 668 + }, + { + "epoch": 1.6245443499392467, + "grad_norm": 91.1799572658908, + "learning_rate": 1.8535343852734332e-05, + "loss": 1.1082, + 
"step": 669 + }, + { + "epoch": 1.6269744835965978, + "grad_norm": 140.3320781032681, + "learning_rate": 1.8303010698955804e-05, + "loss": 1.1587, + "step": 670 + }, + { + "epoch": 1.629404617253949, + "grad_norm": 129.9206563142473, + "learning_rate": 1.8071996080313602e-05, + "loss": 1.0436, + "step": 671 + }, + { + "epoch": 1.6318347509113003, + "grad_norm": 57.52355335064491, + "learning_rate": 1.784230372523018e-05, + "loss": 1.0777, + "step": 672 + }, + { + "epoch": 1.6342648845686512, + "grad_norm": 45.59691137086442, + "learning_rate": 1.76139373407876e-05, + "loss": 1.1133, + "step": 673 + }, + { + "epoch": 1.6366950182260025, + "grad_norm": 174.9829716096277, + "learning_rate": 1.7386900612667633e-05, + "loss": 1.1704, + "step": 674 + }, + { + "epoch": 1.6391251518833536, + "grad_norm": 106.67575565748977, + "learning_rate": 1.7161197205092216e-05, + "loss": 1.108, + "step": 675 + }, + { + "epoch": 1.6415552855407047, + "grad_norm": 80.2118578939736, + "learning_rate": 1.69368307607644e-05, + "loss": 1.1134, + "step": 676 + }, + { + "epoch": 1.6439854191980559, + "grad_norm": 50.075694613199865, + "learning_rate": 1.6713804900809582e-05, + "loss": 1.103, + "step": 677 + }, + { + "epoch": 1.646415552855407, + "grad_norm": 69.23038320811604, + "learning_rate": 1.649212322471695e-05, + "loss": 1.1189, + "step": 678 + }, + { + "epoch": 1.6488456865127583, + "grad_norm": 33.2935221457007, + "learning_rate": 1.6271789310281517e-05, + "loss": 1.0763, + "step": 679 + }, + { + "epoch": 1.6512758201701092, + "grad_norm": 74.75507124872362, + "learning_rate": 1.605280671354632e-05, + "loss": 1.0983, + "step": 680 + }, + { + "epoch": 1.6537059538274606, + "grad_norm": 72.6880045095337, + "learning_rate": 1.583517896874498e-05, + "loss": 1.1151, + "step": 681 + }, + { + "epoch": 1.6561360874848117, + "grad_norm": 59.70666181469054, + "learning_rate": 1.561890958824469e-05, + "loss": 1.1202, + "step": 682 + }, + { + "epoch": 1.6585662211421628, + "grad_norm": 
136.06883726877848, + "learning_rate": 1.540400206248963e-05, + "loss": 1.114, + "step": 683 + }, + { + "epoch": 1.6609963547995141, + "grad_norm": 48.25877797639542, + "learning_rate": 1.5190459859944505e-05, + "loss": 1.0926, + "step": 684 + }, + { + "epoch": 1.663426488456865, + "grad_norm": 99.27065031977625, + "learning_rate": 1.4978286427038601e-05, + "loss": 1.0938, + "step": 685 + }, + { + "epoch": 1.6658566221142164, + "grad_norm": 73.70604863380417, + "learning_rate": 1.4767485188110152e-05, + "loss": 1.0955, + "step": 686 + }, + { + "epoch": 1.6682867557715675, + "grad_norm": 97.29634642853938, + "learning_rate": 1.4558059545351143e-05, + "loss": 1.0993, + "step": 687 + }, + { + "epoch": 1.6707168894289186, + "grad_norm": 169.33237029052367, + "learning_rate": 1.435001287875234e-05, + "loss": 1.1484, + "step": 688 + }, + { + "epoch": 1.6731470230862697, + "grad_norm": 51.080335246500006, + "learning_rate": 1.4143348546048707e-05, + "loss": 1.1279, + "step": 689 + }, + { + "epoch": 1.6755771567436208, + "grad_norm": 123.74332262351422, + "learning_rate": 1.3938069882665328e-05, + "loss": 1.144, + "step": 690 + }, + { + "epoch": 1.6780072904009722, + "grad_norm": 150.6264388349919, + "learning_rate": 1.3734180201663439e-05, + "loss": 1.048, + "step": 691 + }, + { + "epoch": 1.680437424058323, + "grad_norm": 45.78978589208615, + "learning_rate": 1.3531682793687028e-05, + "loss": 1.0943, + "step": 692 + }, + { + "epoch": 1.6828675577156744, + "grad_norm": 59.23541668296553, + "learning_rate": 1.3330580926909763e-05, + "loss": 1.1422, + "step": 693 + }, + { + "epoch": 1.6852976913730255, + "grad_norm": 83.37564839198684, + "learning_rate": 1.3130877846982204e-05, + "loss": 1.1167, + "step": 694 + }, + { + "epoch": 1.6877278250303767, + "grad_norm": 169.89181363126755, + "learning_rate": 1.2932576776979377e-05, + "loss": 1.0153, + "step": 695 + }, + { + "epoch": 1.6901579586877278, + "grad_norm": 41.65359342112402, + "learning_rate": 1.2735680917348802e-05, + 
"loss": 1.0842, + "step": 696 + }, + { + "epoch": 1.692588092345079, + "grad_norm": 91.76072613046553, + "learning_rate": 1.2540193445858883e-05, + "loss": 1.1274, + "step": 697 + }, + { + "epoch": 1.6950182260024302, + "grad_norm": 86.16989165645253, + "learning_rate": 1.2346117517547551e-05, + "loss": 1.106, + "step": 698 + }, + { + "epoch": 1.6974483596597811, + "grad_norm": 75.86627467070798, + "learning_rate": 1.2153456264671337e-05, + "loss": 1.0642, + "step": 699 + }, + { + "epoch": 1.6998784933171325, + "grad_norm": 78.47579727138226, + "learning_rate": 1.1962212796654926e-05, + "loss": 1.053, + "step": 700 + }, + { + "epoch": 1.7023086269744836, + "grad_norm": 81.45952046323904, + "learning_rate": 1.1772390200040817e-05, + "loss": 1.1003, + "step": 701 + }, + { + "epoch": 1.7047387606318347, + "grad_norm": 81.5215081559605, + "learning_rate": 1.1583991538439598e-05, + "loss": 1.0789, + "step": 702 + }, + { + "epoch": 1.707168894289186, + "grad_norm": 123.8954411953181, + "learning_rate": 1.139701985248055e-05, + "loss": 1.0574, + "step": 703 + }, + { + "epoch": 1.709599027946537, + "grad_norm": 66.51876171521589, + "learning_rate": 1.1211478159762478e-05, + "loss": 1.0866, + "step": 704 + }, + { + "epoch": 1.7120291616038883, + "grad_norm": 88.7505135509034, + "learning_rate": 1.1027369454805058e-05, + "loss": 1.1039, + "step": 705 + }, + { + "epoch": 1.7144592952612394, + "grad_norm": 51.948320911337355, + "learning_rate": 1.0844696709000435e-05, + "loss": 1.0891, + "step": 706 + }, + { + "epoch": 1.7168894289185905, + "grad_norm": 116.12502404263041, + "learning_rate": 1.0663462870565411e-05, + "loss": 1.1284, + "step": 707 + }, + { + "epoch": 1.7193195625759417, + "grad_norm": 49.752442053177056, + "learning_rate": 1.0483670864493778e-05, + "loss": 1.11, + "step": 708 + }, + { + "epoch": 1.7217496962332928, + "grad_norm": 89.67691421405478, + "learning_rate": 1.0305323592509009e-05, + "loss": 1.1504, + "step": 709 + }, + { + "epoch": 1.7241798298906441, 
+ "grad_norm": 84.9951363796106, + "learning_rate": 1.0128423933017671e-05, + "loss": 1.1163, + "step": 710 + }, + { + "epoch": 1.726609963547995, + "grad_norm": 53.83015858877197, + "learning_rate": 9.952974741062703e-06, + "loss": 1.0768, + "step": 711 + }, + { + "epoch": 1.7290400972053463, + "grad_norm": 87.01137462153444, + "learning_rate": 9.77897884827752e-06, + "loss": 1.0505, + "step": 712 + }, + { + "epoch": 1.7314702308626975, + "grad_norm": 119.85348125427905, + "learning_rate": 9.606439062840256e-06, + "loss": 1.1866, + "step": 713 + }, + { + "epoch": 1.7339003645200486, + "grad_norm": 38.86482306830089, + "learning_rate": 9.435358169428442e-06, + "loss": 1.1203, + "step": 714 + }, + { + "epoch": 1.7363304981773997, + "grad_norm": 105.47836599222568, + "learning_rate": 9.265738929174051e-06, + "loss": 1.1219, + "step": 715 + }, + { + "epoch": 1.7387606318347508, + "grad_norm": 97.01504953945435, + "learning_rate": 9.097584079618893e-06, + "loss": 1.0897, + "step": 716 + }, + { + "epoch": 1.7411907654921022, + "grad_norm": 55.37203351389315, + "learning_rate": 8.93089633467058e-06, + "loss": 1.0747, + "step": 717 + }, + { + "epoch": 1.743620899149453, + "grad_norm": 53.68546468478919, + "learning_rate": 8.765678384558607e-06, + "loss": 1.0636, + "step": 718 + }, + { + "epoch": 1.7460510328068044, + "grad_norm": 93.22850661983693, + "learning_rate": 8.601932895790877e-06, + "loss": 1.0801, + "step": 719 + }, + { + "epoch": 1.7484811664641555, + "grad_norm": 75.10018201630282, + "learning_rate": 8.439662511110847e-06, + "loss": 1.1608, + "step": 720 + }, + { + "epoch": 1.7509113001215066, + "grad_norm": 75.88601313663253, + "learning_rate": 8.278869849454718e-06, + "loss": 1.0286, + "step": 721 + }, + { + "epoch": 1.7509113001215066, + "eval_loss": 1.1075224876403809, + "eval_runtime": 53.2869, + "eval_samples_per_second": 13.962, + "eval_steps_per_second": 1.745, + "step": 721 + }, + { + "epoch": 1.753341433778858, + "grad_norm": 75.94291636970333, + 
"learning_rate": 8.119557505909215e-06, + "loss": 1.1615, + "step": 722 + }, + { + "epoch": 1.7557715674362089, + "grad_norm": 85.61204534745477, + "learning_rate": 7.961728051669737e-06, + "loss": 1.1312, + "step": 723 + }, + { + "epoch": 1.7582017010935602, + "grad_norm": 42.0496338509614, + "learning_rate": 7.805384033998875e-06, + "loss": 1.1068, + "step": 724 + }, + { + "epoch": 1.7606318347509113, + "grad_norm": 67.9823900081791, + "learning_rate": 7.650527976185173e-06, + "loss": 1.134, + "step": 725 + }, + { + "epoch": 1.7630619684082625, + "grad_norm": 50.797982181202315, + "learning_rate": 7.497162377502542e-06, + "loss": 1.0903, + "step": 726 + }, + { + "epoch": 1.7654921020656136, + "grad_norm": 66.34495889496102, + "learning_rate": 7.3452897131698564e-06, + "loss": 1.0895, + "step": 727 + }, + { + "epoch": 1.7679222357229647, + "grad_norm": 97.21072984563654, + "learning_rate": 7.194912434311052e-06, + "loss": 1.0891, + "step": 728 + }, + { + "epoch": 1.770352369380316, + "grad_norm": 153.67433901334545, + "learning_rate": 7.046032967915483e-06, + "loss": 1.1057, + "step": 729 + }, + { + "epoch": 1.772782503037667, + "grad_norm": 65.34101790074203, + "learning_rate": 6.898653716798887e-06, + "loss": 1.1252, + "step": 730 + }, + { + "epoch": 1.7752126366950183, + "grad_norm": 60.35832905175029, + "learning_rate": 6.75277705956443e-06, + "loss": 1.1177, + "step": 731 + }, + { + "epoch": 1.7776427703523694, + "grad_norm": 47.338317641259096, + "learning_rate": 6.60840535056445e-06, + "loss": 1.0986, + "step": 732 + }, + { + "epoch": 1.7800729040097205, + "grad_norm": 50.2479403169235, + "learning_rate": 6.465540919862456e-06, + "loss": 1.0675, + "step": 733 + }, + { + "epoch": 1.7825030376670719, + "grad_norm": 76.05847584461722, + "learning_rate": 6.32418607319546e-06, + "loss": 1.0962, + "step": 734 + }, + { + "epoch": 1.7849331713244228, + "grad_norm": 5776.25484119808, + "learning_rate": 6.184343091936751e-06, + "loss": 1.1224, + "step": 735 + }, + { 
+ "epoch": 1.787363304981774, + "grad_norm": 60.26557281969165, + "learning_rate": 6.046014233059161e-06, + "loss": 1.1682, + "step": 736 + }, + { + "epoch": 1.789793438639125, + "grad_norm": 173.76865709745172, + "learning_rate": 5.909201729098579e-06, + "loss": 1.1463, + "step": 737 + }, + { + "epoch": 1.7922235722964763, + "grad_norm": 44.51475254326123, + "learning_rate": 5.77390778811796e-06, + "loss": 1.1127, + "step": 738 + }, + { + "epoch": 1.7946537059538274, + "grad_norm": 62.21753016508825, + "learning_rate": 5.640134593671598e-06, + "loss": 1.1897, + "step": 739 + }, + { + "epoch": 1.7970838396111786, + "grad_norm": 57.213736643350934, + "learning_rate": 5.5078843047700275e-06, + "loss": 1.1004, + "step": 740 + }, + { + "epoch": 1.79951397326853, + "grad_norm": 73.79879091710353, + "learning_rate": 5.3771590558450265e-06, + "loss": 1.2378, + "step": 741 + }, + { + "epoch": 1.8019441069258808, + "grad_norm": 69.33802717306622, + "learning_rate": 5.247960956715259e-06, + "loss": 1.078, + "step": 742 + }, + { + "epoch": 1.8043742405832321, + "grad_norm": 70.1118673770208, + "learning_rate": 5.12029209255227e-06, + "loss": 1.1082, + "step": 743 + }, + { + "epoch": 1.8068043742405833, + "grad_norm": 60.49720666164233, + "learning_rate": 4.994154523846695e-06, + "loss": 1.1694, + "step": 744 + }, + { + "epoch": 1.8092345078979344, + "grad_norm": 62.67576959014564, + "learning_rate": 4.869550286375091e-06, + "loss": 1.1017, + "step": 745 + }, + { + "epoch": 1.8116646415552855, + "grad_norm": 52.797557858037294, + "learning_rate": 4.746481391167068e-06, + "loss": 1.0547, + "step": 746 + }, + { + "epoch": 1.8140947752126366, + "grad_norm": 78.39250293351613, + "learning_rate": 4.624949824472858e-06, + "loss": 1.1395, + "step": 747 + }, + { + "epoch": 1.816524908869988, + "grad_norm": 345.5438304913529, + "learning_rate": 4.504957547731214e-06, + "loss": 1.1248, + "step": 748 + }, + { + "epoch": 1.8189550425273389, + "grad_norm": 128.04277202807285, + 
"learning_rate": 4.386506497537757e-06, + "loss": 1.2115, + "step": 749 + }, + { + "epoch": 1.8213851761846902, + "grad_norm": 81.9882481842496, + "learning_rate": 4.269598585613776e-06, + "loss": 1.071, + "step": 750 + }, + { + "epoch": 1.8238153098420413, + "grad_norm": 99.80236227862193, + "learning_rate": 4.154235698775277e-06, + "loss": 1.1591, + "step": 751 + }, + { + "epoch": 1.8262454434993924, + "grad_norm": 152.61066223998088, + "learning_rate": 4.040419698902631e-06, + "loss": 1.1322, + "step": 752 + }, + { + "epoch": 1.8286755771567438, + "grad_norm": 32.12973237305346, + "learning_rate": 3.928152422910491e-06, + "loss": 1.0985, + "step": 753 + }, + { + "epoch": 1.8311057108140947, + "grad_norm": 42.81358112745556, + "learning_rate": 3.817435682718096e-06, + "loss": 1.1252, + "step": 754 + }, + { + "epoch": 1.833535844471446, + "grad_norm": 54.37793958217706, + "learning_rate": 3.7082712652200867e-06, + "loss": 1.1263, + "step": 755 + }, + { + "epoch": 1.8359659781287971, + "grad_norm": 76.10189962024336, + "learning_rate": 3.6006609322576156e-06, + "loss": 1.2002, + "step": 756 + }, + { + "epoch": 1.8383961117861483, + "grad_norm": 65.3082792846633, + "learning_rate": 3.4946064205899965e-06, + "loss": 1.074, + "step": 757 + }, + { + "epoch": 1.8408262454434994, + "grad_norm": 51.04243707129297, + "learning_rate": 3.390109441866618e-06, + "loss": 1.1253, + "step": 758 + }, + { + "epoch": 1.8432563791008505, + "grad_norm": 162.25838422994087, + "learning_rate": 3.287171682599255e-06, + "loss": 1.0746, + "step": 759 + }, + { + "epoch": 1.8456865127582018, + "grad_norm": 53.42288615863692, + "learning_rate": 3.1857948041349894e-06, + "loss": 1.0434, + "step": 760 + }, + { + "epoch": 1.8481166464155527, + "grad_norm": 47.70669365600897, + "learning_rate": 3.085980442629288e-06, + "loss": 1.0694, + "step": 761 + }, + { + "epoch": 1.850546780072904, + "grad_norm": 35.5865078619899, + "learning_rate": 2.9877302090196346e-06, + "loss": 1.1292, + "step": 762 + 
}, + { + "epoch": 1.8529769137302552, + "grad_norm": 259.96541089150685, + "learning_rate": 2.8910456889995498e-06, + "loss": 1.1138, + "step": 763 + }, + { + "epoch": 1.8554070473876063, + "grad_norm": 100.9818148806922, + "learning_rate": 2.7959284429929456e-06, + "loss": 1.1414, + "step": 764 + }, + { + "epoch": 1.8578371810449574, + "grad_norm": 58.16414187512478, + "learning_rate": 2.7023800061289907e-06, + "loss": 1.1076, + "step": 765 + }, + { + "epoch": 1.8602673147023085, + "grad_norm": 48.78094545883972, + "learning_rate": 2.6104018882173064e-06, + "loss": 1.1061, + "step": 766 + }, + { + "epoch": 1.86269744835966, + "grad_norm": 32.1696048844329, + "learning_rate": 2.5199955737236104e-06, + "loss": 1.0771, + "step": 767 + }, + { + "epoch": 1.8651275820170108, + "grad_norm": 67.39153628388367, + "learning_rate": 2.4311625217457778e-06, + "loss": 1.1179, + "step": 768 + }, + { + "epoch": 1.8675577156743621, + "grad_norm": 57.3659391222766, + "learning_rate": 2.3439041659902407e-06, + "loss": 1.1348, + "step": 769 + }, + { + "epoch": 1.8699878493317132, + "grad_norm": 48.86332839622862, + "learning_rate": 2.2582219147489147e-06, + "loss": 1.067, + "step": 770 + }, + { + "epoch": 1.8724179829890644, + "grad_norm": 59.25100384913438, + "learning_rate": 2.174117150876398e-06, + "loss": 1.0814, + "step": 771 + }, + { + "epoch": 1.8748481166464157, + "grad_norm": 63.612150673079476, + "learning_rate": 2.091591231767709e-06, + "loss": 1.149, + "step": 772 + }, + { + "epoch": 1.8772782503037666, + "grad_norm": 76.64767939109268, + "learning_rate": 2.010645489336382e-06, + "loss": 1.1809, + "step": 773 + }, + { + "epoch": 1.879708383961118, + "grad_norm": 73.74931792569282, + "learning_rate": 1.9312812299929094e-06, + "loss": 1.1327, + "step": 774 + }, + { + "epoch": 1.882138517618469, + "grad_norm": 58.80672287733774, + "learning_rate": 1.8534997346237093e-06, + "loss": 1.0941, + "step": 775 + }, + { + "epoch": 1.8845686512758202, + "grad_norm": 
38.760707938386155, + "learning_rate": 1.777302258570479e-06, + "loss": 1.0785, + "step": 776 + }, + { + "epoch": 1.8869987849331713, + "grad_norm": 78.58894037195526, + "learning_rate": 1.7026900316098215e-06, + "loss": 1.1067, + "step": 777 + }, + { + "epoch": 1.8894289185905224, + "grad_norm": 34.985018454660455, + "learning_rate": 1.6296642579335496e-06, + "loss": 1.0454, + "step": 778 + }, + { + "epoch": 1.8918590522478738, + "grad_norm": 56.94816404212683, + "learning_rate": 1.5582261161291245e-06, + "loss": 1.1402, + "step": 779 + }, + { + "epoch": 1.8942891859052247, + "grad_norm": 88.47683213840897, + "learning_rate": 1.4883767591606924e-06, + "loss": 1.1847, + "step": 780 + }, + { + "epoch": 1.896719319562576, + "grad_norm": 164.4313185857989, + "learning_rate": 1.4201173143504888e-06, + "loss": 1.0246, + "step": 781 + }, + { + "epoch": 1.8991494532199271, + "grad_norm": 44.826186028268175, + "learning_rate": 1.3534488833605974e-06, + "loss": 1.1285, + "step": 782 + }, + { + "epoch": 1.9015795868772782, + "grad_norm": 75.6977790059232, + "learning_rate": 1.2883725421752201e-06, + "loss": 1.111, + "step": 783 + }, + { + "epoch": 1.9040097205346294, + "grad_norm": 74.26685984847175, + "learning_rate": 1.2248893410832685e-06, + "loss": 1.0741, + "step": 784 + }, + { + "epoch": 1.9064398541919805, + "grad_norm": 54.63352143603637, + "learning_rate": 1.1630003046614323e-06, + "loss": 1.0816, + "step": 785 + }, + { + "epoch": 1.9088699878493318, + "grad_norm": 55.31777597928668, + "learning_rate": 1.1027064317576385e-06, + "loss": 1.1589, + "step": 786 + }, + { + "epoch": 1.9113001215066827, + "grad_norm": 81.61311763219298, + "learning_rate": 1.0440086954749517e-06, + "loss": 1.1165, + "step": 787 + }, + { + "epoch": 1.913730255164034, + "grad_norm": 49.503370916119344, + "learning_rate": 9.869080431558542e-07, + "loss": 1.1239, + "step": 788 + }, + { + "epoch": 1.9161603888213852, + "grad_norm": 47.648410246233475, + "learning_rate": 9.314053963669245e-07, + 
"loss": 1.1121, + "step": 789 + }, + { + "epoch": 1.9185905224787363, + "grad_norm": 66.83906080469764, + "learning_rate": 8.775016508840272e-07, + "loss": 1.1461, + "step": 790 + }, + { + "epoch": 1.9210206561360876, + "grad_norm": 148.92765988977712, + "learning_rate": 8.251976766777913e-07, + "loss": 1.1627, + "step": 791 + }, + { + "epoch": 1.9234507897934385, + "grad_norm": 70.09639733538229, + "learning_rate": 7.744943178996101e-07, + "loss": 1.0935, + "step": 792 + }, + { + "epoch": 1.9258809234507899, + "grad_norm": 85.9264341638202, + "learning_rate": 7.253923928680406e-07, + "loss": 1.1071, + "step": 793 + }, + { + "epoch": 1.928311057108141, + "grad_norm": 76.17401772146403, + "learning_rate": 6.778926940555152e-07, + "loss": 1.1448, + "step": 794 + }, + { + "epoch": 1.930741190765492, + "grad_norm": 91.15259575797278, + "learning_rate": 6.319959880756177e-07, + "loss": 1.1101, + "step": 795 + }, + { + "epoch": 1.9331713244228432, + "grad_norm": 31.655747702448462, + "learning_rate": 5.877030156707042e-07, + "loss": 1.039, + "step": 796 + }, + { + "epoch": 1.9356014580801943, + "grad_norm": 50.84824944272368, + "learning_rate": 5.450144916999134e-07, + "loss": 1.0511, + "step": 797 + }, + { + "epoch": 1.9380315917375457, + "grad_norm": 65.68864220454576, + "learning_rate": 5.039311051276752e-07, + "loss": 1.1926, + "step": 798 + }, + { + "epoch": 1.9404617253948966, + "grad_norm": 87.84909056776107, + "learning_rate": 4.644535190125421e-07, + "loss": 1.1022, + "step": 799 + }, + { + "epoch": 1.942891859052248, + "grad_norm": 49.60648808282262, + "learning_rate": 4.2658237049655323e-07, + "loss": 1.1283, + "step": 800 + }, + { + "epoch": 1.945321992709599, + "grad_norm": 60.29916671699956, + "learning_rate": 3.903182707948649e-07, + "loss": 1.0659, + "step": 801 + }, + { + "epoch": 1.9477521263669502, + "grad_norm": 85.60352537490172, + "learning_rate": 3.556618051859584e-07, + "loss": 1.1473, + "step": 802 + }, + { + "epoch": 1.9501822600243013, + 
"grad_norm": 63.26392365183893, + "learning_rate": 3.2261353300219176e-07, + "loss": 1.1018, + "step": 803 + }, + { + "epoch": 1.9526123936816524, + "grad_norm": 58.902160704029676, + "learning_rate": 2.9117398762069647e-07, + "loss": 1.1158, + "step": 804 + }, + { + "epoch": 1.9550425273390037, + "grad_norm": 95.75043109531097, + "learning_rate": 2.613436764548505e-07, + "loss": 1.1034, + "step": 805 + }, + { + "epoch": 1.9574726609963546, + "grad_norm": 61.028532599630765, + "learning_rate": 2.3312308094607382e-07, + "loss": 1.1239, + "step": 806 + }, + { + "epoch": 1.959902794653706, + "grad_norm": 42.40393099941929, + "learning_rate": 2.0651265655603492e-07, + "loss": 1.0899, + "step": 807 + }, + { + "epoch": 1.962332928311057, + "grad_norm": 72.5324497099459, + "learning_rate": 1.8151283275928964e-07, + "loss": 1.0923, + "step": 808 + }, + { + "epoch": 1.9647630619684082, + "grad_norm": 70.09629889999877, + "learning_rate": 1.5812401303639813e-07, + "loss": 1.122, + "step": 809 + }, + { + "epoch": 1.9671931956257596, + "grad_norm": 51.82649450794088, + "learning_rate": 1.3634657486737424e-07, + "loss": 1.1976, + "step": 810 + }, + { + "epoch": 1.9696233292831105, + "grad_norm": 38.14749887357619, + "learning_rate": 1.1618086972559062e-07, + "loss": 1.1402, + "step": 811 + }, + { + "epoch": 1.9720534629404618, + "grad_norm": 63.952260610433626, + "learning_rate": 9.762722307213868e-08, + "loss": 1.1099, + "step": 812 + }, + { + "epoch": 1.974483596597813, + "grad_norm": 67.73936930746979, + "learning_rate": 8.068593435055505e-08, + "loss": 1.0666, + "step": 813 + }, + { + "epoch": 1.976913730255164, + "grad_norm": 199.80849466800598, + "learning_rate": 6.535727698199213e-08, + "loss": 1.1676, + "step": 814 + }, + { + "epoch": 1.9793438639125152, + "grad_norm": 128.36184695966296, + "learning_rate": 5.164149836077714e-08, + "loss": 1.1347, + "step": 815 + }, + { + "epoch": 1.9817739975698663, + "grad_norm": 117.93650898577977, + "learning_rate": 
3.953881985047092e-08, + "loss": 1.0093, + "step": 816 + }, + { + "epoch": 1.9842041312272176, + "grad_norm": 133.66858791525644, + "learning_rate": 2.9049436780281825e-08, + "loss": 1.132, + "step": 817 + }, + { + "epoch": 1.9866342648845685, + "grad_norm": 45.0102112528463, + "learning_rate": 2.0173518441868324e-08, + "loss": 1.1243, + "step": 818 + }, + { + "epoch": 1.9890643985419199, + "grad_norm": 99.86626236714106, + "learning_rate": 1.2911208086663351e-08, + "loss": 1.1503, + "step": 819 + }, + { + "epoch": 1.991494532199271, + "grad_norm": 76.9095216170618, + "learning_rate": 7.262622923531747e-09, + "loss": 1.1219, + "step": 820 + }, + { + "epoch": 1.993924665856622, + "grad_norm": 53.6297519446309, + "learning_rate": 3.2278541168717646e-09, + "loss": 1.1142, + "step": 821 + }, + { + "epoch": 1.9963547995139734, + "grad_norm": 113.49011128083137, + "learning_rate": 8.069667851939855e-10, + "loss": 1.1222, + "step": 822 + } + ], + "logging_steps": 1, + "max_steps": 822, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 206, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.218071900027093e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-822/training_args.bin b/checkpoint-822/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/checkpoint-822/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568 diff --git a/checkpoint-822/zero_to_fp32.py b/checkpoint-822/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- 
/dev/null +++ b/checkpoint-822/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..cdf48b3a202fbd0a012bd0c34f94c036b0ff0d8c --- /dev/null +++ b/config.json @@ -0,0 +1,28 @@ +{ + "_attn_implementation_autoset": true, + "architectures": [ + "Glm4ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "eos_token_id": 151336, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 23040, + "max_position_embeddings": 32768, + "model_type": "glm4", + "num_attention_heads": 48, + "num_hidden_layers": 61, + "num_key_value_heads": 2, + "pad_token_id": 151329, + "partial_rotary_factor": 0.5, + "rms_norm_eps": 1e-05, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": false, + "vocab_size": 151552 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..243977cf4d01fec6edb0a8f97e440826b936aee7 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "do_sample": true, + "eos_token_id": [ + 151329, + 151336, + 151338 + ], + "pad_token_id": 151329, + "transformers_version": "4.51.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7930155509434a0398f24149f374be2488e61633 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5ae215fc1360a879202bd292aed15250010a2d5206626b9aa54c8d5f4acc0df2 +size 57136 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3cf953c2c8a3c89778c92e54c685942bb1130616 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,32 @@ +{ + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "eos_token": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..4d1dde37a2715c11628fc84bf571976f9f80eb69 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c +size 19966496 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e19c45f6310a17f6c1c6be76a429ac68438d547f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,146 @@ +{ + "added_tokens_decoder": { + "151329": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151330": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151331": { + "content": "[gMASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151332": { + "content": "[sMASK]", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151335": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151336": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151337": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151338": { + "content": "<|observation|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151339": { + "content": "<|begin_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151340": { + "content": "<|end_of_image|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151341": { + "content": "<|begin_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151342": { + "content": "<|end_of_video|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>" + ], + "chat_template": "[gMASK]\n{%- if tools -%}\n<|system|>\n# 可用工具\n{% for tool in tools %}\n {%- set function = tool.function if tool.get(\"function\") else tool 
%}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。\n{%- endfor %}\n{%- endif -%}\n\n{%- for msg in messages %}\n {%- if msg.role == 'system' %}\n<|system|>\n{{ msg.content }}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages if message.role != 'system' %}\n {%- set role = message['role'] %}\n {%- set content = message['content'] %}\n {%- set meta = message.get(\"metadata\", \"\") %}\n\n {%- if role == 'user' %}\n<|user|>\n{{ content }}\n {%- elif role == 'assistant' and not meta %}\n<|assistant|>\n{{ content }}\n {%- elif role == 'assistant' and meta %}\n<|assistant|>{{ meta }}\n{{ content }}\n {%- elif role == 'observation' %}\n<|observation|>\n{{ content }}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}<|assistant|>{% endif %}", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|user|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 128000, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..529c57f6a4b7b9fa2912b10c5ebbd4c9ae92b0f2 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6cfbae5f5972dd850bae3d0987f916904b4b5b8d723c11ef16db54c57724a76 +size 8568